blob: 99e94283816cd2f6eec8848dea657fa4198ec52d [file] [log] [blame]
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001/*
2*******************************************************************************
Jungshik Shin70f82502016-01-29 00:32:36 -08003* Copyright (C) 2004-2015, International Business Machines
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00004* Corporation and others. All Rights Reserved.
5*******************************************************************************
6* file name: uregex.cpp
7*/
8
9#include "unicode/utypes.h"
10
11#if !UCONFIG_NO_REGULAR_EXPRESSIONS
12
13#include "unicode/regex.h"
14#include "unicode/uregex.h"
15#include "unicode/unistr.h"
16#include "unicode/ustring.h"
17#include "unicode/uchar.h"
18#include "unicode/uobject.h"
19#include "unicode/utf16.h"
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000020#include "cmemory.h"
Jungshik Shin70f82502016-01-29 00:32:36 -080021#include "uassert.h"
22#include "uhash.h"
23#include "umutex.h"
24#include "uvectr32.h"
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000025
26#include "regextxt.h"
27
jshin@chromium.org6f31ac32014-03-26 22:15:14 +000028U_NAMESPACE_BEGIN
29
// Number of units still free in a buffer of `len` units once `idx` units have
// been used.  Evaluates to 0, never a negative value, when idx has reached or
// passed len.
#define REMAINING_CAPACITY(idx,len) \
    ((0 < ((len)-(idx))) ? ((len)-(idx)) : 0)
31
32struct RegularExpression: public UMemory {
33public:
34 RegularExpression();
35 ~RegularExpression();
36 int32_t fMagic;
37 RegexPattern *fPat;
38 u_atomic_int32_t *fPatRefCount;
39 UChar *fPatString;
40 int32_t fPatStringLen;
41 RegexMatcher *fMatcher;
42 const UChar *fText; // Text from setText()
43 int32_t fTextLength; // Length provided by user with setText(), which
44 // may be -1.
45 UBool fOwnsText;
46};
47
// Tag stored in RegularExpression::fMagic and checked by validateRE().
// The four bytes spell "rexp" in ASCII.
static const int32_t REXP_MAGIC = 0x72657870;
49
50RegularExpression::RegularExpression() {
51 fMagic = REXP_MAGIC;
52 fPat = NULL;
53 fPatRefCount = NULL;
54 fPatString = NULL;
55 fPatStringLen = 0;
56 fMatcher = NULL;
57 fText = NULL;
58 fTextLength = 0;
59 fOwnsText = FALSE;
60}
61
62RegularExpression::~RegularExpression() {
63 delete fMatcher;
64 fMatcher = NULL;
65 if (fPatRefCount!=NULL && umtx_atomic_dec(fPatRefCount)==0) {
66 delete fPat;
67 uprv_free(fPatString);
68 uprv_free((void *)fPatRefCount);
69 }
70 if (fOwnsText && fText!=NULL) {
71 uprv_free((void *)fText);
72 }
73 fMagic = 0;
74}
75
76U_NAMESPACE_END
77
78U_NAMESPACE_USE
79
80//----------------------------------------------------------------------------------------
81//
82// validateRE Do boilerplate style checks on API function parameters.
83// Return TRUE if they look OK.
84//----------------------------------------------------------------------------------------
85static UBool validateRE(const RegularExpression *re, UBool requiresText, UErrorCode *status) {
86 if (U_FAILURE(*status)) {
87 return FALSE;
88 }
89 if (re == NULL || re->fMagic != REXP_MAGIC) {
90 *status = U_ILLEGAL_ARGUMENT_ERROR;
91 return FALSE;
92 }
93 // !!! Not sure how to update this with the new UText backing, which is stored in re->fMatcher anyway
94 if (requiresText && re->fText == NULL && !re->fOwnsText) {
95 *status = U_REGEX_INVALID_STATE;
96 return FALSE;
97 }
98 return TRUE;
99}
100
101//----------------------------------------------------------------------------------------
102//
103// uregex_open
104//
105//----------------------------------------------------------------------------------------
106U_CAPI URegularExpression * U_EXPORT2
107uregex_open( const UChar *pattern,
108 int32_t patternLength,
109 uint32_t flags,
110 UParseError *pe,
111 UErrorCode *status) {
112
113 if (U_FAILURE(*status)) {
114 return NULL;
115 }
116 if (pattern == NULL || patternLength < -1 || patternLength == 0) {
117 *status = U_ILLEGAL_ARGUMENT_ERROR;
118 return NULL;
119 }
120 int32_t actualPatLen = patternLength;
121 if (actualPatLen == -1) {
122 actualPatLen = u_strlen(pattern);
123 }
124
125 RegularExpression *re = new RegularExpression;
126 u_atomic_int32_t *refC = (u_atomic_int32_t *)uprv_malloc(sizeof(int32_t));
127 UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1));
128 if (re == NULL || refC == NULL || patBuf == NULL) {
129 *status = U_MEMORY_ALLOCATION_ERROR;
130 delete re;
131 uprv_free((void *)refC);
132 uprv_free(patBuf);
133 return NULL;
134 }
135 re->fPatRefCount = refC;
136 *re->fPatRefCount = 1;
137
138 //
139 // Make a copy of the pattern string, so we can return it later if asked.
140 // For compiling the pattern, we will use a UText wrapper around
141 // this local copy, to avoid making even more copies.
142 //
143 re->fPatString = patBuf;
144 re->fPatStringLen = patternLength;
145 u_memcpy(patBuf, pattern, actualPatLen);
146 patBuf[actualPatLen] = 0;
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800147
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000148 UText patText = UTEXT_INITIALIZER;
149 utext_openUChars(&patText, patBuf, patternLength, status);
150
151 //
152 // Compile the pattern
153 //
154 if (pe != NULL) {
155 re->fPat = RegexPattern::compile(&patText, flags, *pe, *status);
156 } else {
157 re->fPat = RegexPattern::compile(&patText, flags, *status);
158 }
159 utext_close(&patText);
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800160
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000161 if (U_FAILURE(*status)) {
162 goto ErrorExit;
163 }
164
165 //
166 // Create the matcher object
167 //
168 re->fMatcher = re->fPat->matcher(*status);
169 if (U_SUCCESS(*status)) {
170 return (URegularExpression*)re;
171 }
172
173ErrorExit:
174 delete re;
175 return NULL;
176
177}
178
179//----------------------------------------------------------------------------------------
180//
181// uregex_openUText
182//
183//----------------------------------------------------------------------------------------
184U_CAPI URegularExpression * U_EXPORT2
185uregex_openUText(UText *pattern,
186 uint32_t flags,
187 UParseError *pe,
188 UErrorCode *status) {
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800189
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000190 if (U_FAILURE(*status)) {
191 return NULL;
192 }
193 if (pattern == NULL) {
194 *status = U_ILLEGAL_ARGUMENT_ERROR;
195 return NULL;
196 }
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800197
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000198 int64_t patternNativeLength = utext_nativeLength(pattern);
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800199
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000200 if (patternNativeLength == 0) {
201 *status = U_ILLEGAL_ARGUMENT_ERROR;
202 return NULL;
203 }
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800204
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000205 RegularExpression *re = new RegularExpression;
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800206
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000207 UErrorCode lengthStatus = U_ZERO_ERROR;
208 int32_t pattern16Length = utext_extract(pattern, 0, patternNativeLength, NULL, 0, &lengthStatus);
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800209
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000210 u_atomic_int32_t *refC = (u_atomic_int32_t *)uprv_malloc(sizeof(int32_t));
211 UChar *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(pattern16Length+1));
212 if (re == NULL || refC == NULL || patBuf == NULL) {
213 *status = U_MEMORY_ALLOCATION_ERROR;
214 delete re;
215 uprv_free((void *)refC);
216 uprv_free(patBuf);
217 return NULL;
218 }
219 re->fPatRefCount = refC;
220 *re->fPatRefCount = 1;
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800221
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000222 //
223 // Make a copy of the pattern string, so we can return it later if asked.
224 // For compiling the pattern, we will use a read-only UText wrapper
225 // around this local copy, to avoid making even more copies.
226 //
227 re->fPatString = patBuf;
228 re->fPatStringLen = pattern16Length;
229 utext_extract(pattern, 0, patternNativeLength, patBuf, pattern16Length+1, status);
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800230
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000231 UText patText = UTEXT_INITIALIZER;
232 utext_openUChars(&patText, patBuf, pattern16Length, status);
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800233
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000234 //
235 // Compile the pattern
236 //
237 if (pe != NULL) {
238 re->fPat = RegexPattern::compile(&patText, flags, *pe, *status);
239 } else {
240 re->fPat = RegexPattern::compile(&patText, flags, *status);
241 }
242 utext_close(&patText);
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800243
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000244 if (U_FAILURE(*status)) {
245 goto ErrorExit;
246 }
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800247
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000248 //
249 // Create the matcher object
250 //
251 re->fMatcher = re->fPat->matcher(*status);
252 if (U_SUCCESS(*status)) {
253 return (URegularExpression*)re;
254 }
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800255
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000256ErrorExit:
257 delete re;
258 return NULL;
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800259
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000260}
261
262//----------------------------------------------------------------------------------------
263//
264// uregex_close
265//
266//----------------------------------------------------------------------------------------
267U_CAPI void U_EXPORT2
268uregex_close(URegularExpression *re2) {
269 RegularExpression *re = (RegularExpression*)re2;
270 UErrorCode status = U_ZERO_ERROR;
271 if (validateRE(re, FALSE, &status) == FALSE) {
272 return;
273 }
274 delete re;
275}
276
277
278//----------------------------------------------------------------------------------------
279//
280// uregex_clone
281//
282//----------------------------------------------------------------------------------------
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800283U_CAPI URegularExpression * U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000284uregex_clone(const URegularExpression *source2, UErrorCode *status) {
285 RegularExpression *source = (RegularExpression*)source2;
286 if (validateRE(source, FALSE, status) == FALSE) {
287 return NULL;
288 }
289
290 RegularExpression *clone = new RegularExpression;
291 if (clone == NULL) {
292 *status = U_MEMORY_ALLOCATION_ERROR;
293 return NULL;
294 }
295
296 clone->fMatcher = source->fPat->matcher(*status);
297 if (U_FAILURE(*status)) {
298 delete clone;
299 return NULL;
300 }
301
302 clone->fPat = source->fPat;
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800303 clone->fPatRefCount = source->fPatRefCount;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000304 clone->fPatString = source->fPatString;
305 clone->fPatStringLen = source->fPatStringLen;
306 umtx_atomic_inc(source->fPatRefCount);
307 // Note: fText is not cloned.
308
309 return (URegularExpression*)clone;
310}
311
312
313
314
315//------------------------------------------------------------------------------
316//
317// uregex_pattern
318//
319//------------------------------------------------------------------------------
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800320U_CAPI const UChar * U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000321uregex_pattern(const URegularExpression *regexp2,
322 int32_t *patLength,
323 UErrorCode *status) {
324 RegularExpression *regexp = (RegularExpression*)regexp2;
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800325
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000326 if (validateRE(regexp, FALSE, status) == FALSE) {
327 return NULL;
328 }
329 if (patLength != NULL) {
330 *patLength = regexp->fPatStringLen;
331 }
332 return regexp->fPatString;
333}
334
335
336//------------------------------------------------------------------------------
337//
338// uregex_patternUText
339//
340//------------------------------------------------------------------------------
341U_CAPI UText * U_EXPORT2
342uregex_patternUText(const URegularExpression *regexp2,
343 UErrorCode *status) {
344 RegularExpression *regexp = (RegularExpression*)regexp2;
345 return regexp->fPat->patternText(*status);
346}
347
348
349//------------------------------------------------------------------------------
350//
351// uregex_flags
352//
353//------------------------------------------------------------------------------
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800354U_CAPI int32_t U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000355uregex_flags(const URegularExpression *regexp2, UErrorCode *status) {
356 RegularExpression *regexp = (RegularExpression*)regexp2;
357 if (validateRE(regexp, FALSE, status) == FALSE) {
358 return 0;
359 }
360 int32_t flags = regexp->fPat->flags();
361 return flags;
362}
363
364
365//------------------------------------------------------------------------------
366//
367// uregex_setText
368//
369//------------------------------------------------------------------------------
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800370U_CAPI void U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000371uregex_setText(URegularExpression *regexp2,
372 const UChar *text,
373 int32_t textLength,
374 UErrorCode *status) {
375 RegularExpression *regexp = (RegularExpression*)regexp2;
376 if (validateRE(regexp, FALSE, status) == FALSE) {
377 return;
378 }
379 if (text == NULL || textLength < -1) {
380 *status = U_ILLEGAL_ARGUMENT_ERROR;
381 return;
382 }
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800383
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000384 if (regexp->fOwnsText && regexp->fText != NULL) {
385 uprv_free((void *)regexp->fText);
386 }
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800387
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000388 regexp->fText = text;
389 regexp->fTextLength = textLength;
390 regexp->fOwnsText = FALSE;
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800391
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000392 UText input = UTEXT_INITIALIZER;
393 utext_openUChars(&input, text, textLength, status);
394 regexp->fMatcher->reset(&input);
395 utext_close(&input); // reset() made a shallow clone, so we don't need this copy
396}
397
398
399//------------------------------------------------------------------------------
400//
401// uregex_setUText
402//
403//------------------------------------------------------------------------------
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800404U_CAPI void U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000405uregex_setUText(URegularExpression *regexp2,
406 UText *text,
407 UErrorCode *status) {
408 RegularExpression *regexp = (RegularExpression*)regexp2;
409 if (validateRE(regexp, FALSE, status) == FALSE) {
410 return;
411 }
412 if (text == NULL) {
413 *status = U_ILLEGAL_ARGUMENT_ERROR;
414 return;
415 }
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800416
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000417 if (regexp->fOwnsText && regexp->fText != NULL) {
418 uprv_free((void *)regexp->fText);
419 }
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800420
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000421 regexp->fText = NULL; // only fill it in on request
422 regexp->fTextLength = -1;
423 regexp->fOwnsText = TRUE;
424 regexp->fMatcher->reset(text);
425}
426
427
428
429//------------------------------------------------------------------------------
430//
431// uregex_getText
432//
433//------------------------------------------------------------------------------
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800434U_CAPI const UChar * U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000435uregex_getText(URegularExpression *regexp2,
436 int32_t *textLength,
437 UErrorCode *status) {
438 RegularExpression *regexp = (RegularExpression*)regexp2;
439 if (validateRE(regexp, FALSE, status) == FALSE) {
440 return NULL;
441 }
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800442
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000443 if (regexp->fText == NULL) {
444 // need to fill in the text
445 UText *inputText = regexp->fMatcher->inputText();
446 int64_t inputNativeLength = utext_nativeLength(inputText);
447 if (UTEXT_FULL_TEXT_IN_CHUNK(inputText, inputNativeLength)) {
448 regexp->fText = inputText->chunkContents;
449 regexp->fTextLength = (int32_t)inputNativeLength;
450 regexp->fOwnsText = FALSE; // because the UText owns it
451 } else {
452 UErrorCode lengthStatus = U_ZERO_ERROR;
453 regexp->fTextLength = utext_extract(inputText, 0, inputNativeLength, NULL, 0, &lengthStatus); // buffer overflow error
454 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(regexp->fTextLength+1));
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800455
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000456 utext_extract(inputText, 0, inputNativeLength, inputChars, regexp->fTextLength+1, status);
457 regexp->fText = inputChars;
458 regexp->fOwnsText = TRUE; // should already be set but just in case
459 }
460 }
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800461
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000462 if (textLength != NULL) {
463 *textLength = regexp->fTextLength;
464 }
465 return regexp->fText;
466}
467
468
469//------------------------------------------------------------------------------
470//
471// uregex_getUText
472//
473//------------------------------------------------------------------------------
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800474U_CAPI UText * U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000475uregex_getUText(URegularExpression *regexp2,
476 UText *dest,
477 UErrorCode *status) {
478 RegularExpression *regexp = (RegularExpression*)regexp2;
479 if (validateRE(regexp, FALSE, status) == FALSE) {
480 return dest;
481 }
482 return regexp->fMatcher->getInput(dest, *status);
483}
484
485
486//------------------------------------------------------------------------------
487//
488// uregex_refreshUText
489//
490//------------------------------------------------------------------------------
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800491U_CAPI void U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000492uregex_refreshUText(URegularExpression *regexp2,
493 UText *text,
494 UErrorCode *status) {
495 RegularExpression *regexp = (RegularExpression*)regexp2;
496 if (validateRE(regexp, FALSE, status) == FALSE) {
497 return;
498 }
499 regexp->fMatcher->refreshInputText(text, *status);
500}
501
502
503//------------------------------------------------------------------------------
504//
505// uregex_matches
506//
507//------------------------------------------------------------------------------
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800508U_CAPI UBool U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000509uregex_matches(URegularExpression *regexp2,
510 int32_t startIndex,
511 UErrorCode *status) {
512 return uregex_matches64( regexp2, (int64_t)startIndex, status);
513}
514
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800515U_CAPI UBool U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000516uregex_matches64(URegularExpression *regexp2,
517 int64_t startIndex,
518 UErrorCode *status) {
519 RegularExpression *regexp = (RegularExpression*)regexp2;
520 UBool result = FALSE;
521 if (validateRE(regexp, TRUE, status) == FALSE) {
522 return result;
523 }
524 if (startIndex == -1) {
525 result = regexp->fMatcher->matches(*status);
526 } else {
527 result = regexp->fMatcher->matches(startIndex, *status);
528 }
529 return result;
530}
531
532
533//------------------------------------------------------------------------------
534//
535// uregex_lookingAt
536//
537//------------------------------------------------------------------------------
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800538U_CAPI UBool U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000539uregex_lookingAt(URegularExpression *regexp2,
540 int32_t startIndex,
541 UErrorCode *status) {
542 return uregex_lookingAt64( regexp2, (int64_t)startIndex, status);
543}
544
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800545U_CAPI UBool U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000546uregex_lookingAt64(URegularExpression *regexp2,
547 int64_t startIndex,
548 UErrorCode *status) {
549 RegularExpression *regexp = (RegularExpression*)regexp2;
550 UBool result = FALSE;
551 if (validateRE(regexp, TRUE, status) == FALSE) {
552 return result;
553 }
554 if (startIndex == -1) {
555 result = regexp->fMatcher->lookingAt(*status);
556 } else {
557 result = regexp->fMatcher->lookingAt(startIndex, *status);
558 }
559 return result;
560}
561
562
563
564//------------------------------------------------------------------------------
565//
566// uregex_find
567//
568//------------------------------------------------------------------------------
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800569U_CAPI UBool U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000570uregex_find(URegularExpression *regexp2,
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800571 int32_t startIndex,
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000572 UErrorCode *status) {
573 return uregex_find64( regexp2, (int64_t)startIndex, status);
574}
575
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800576U_CAPI UBool U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000577uregex_find64(URegularExpression *regexp2,
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800578 int64_t startIndex,
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000579 UErrorCode *status) {
580 RegularExpression *regexp = (RegularExpression*)regexp2;
581 UBool result = FALSE;
582 if (validateRE(regexp, TRUE, status) == FALSE) {
583 return result;
584 }
585 if (startIndex == -1) {
586 regexp->fMatcher->resetPreserveRegion();
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800587 result = regexp->fMatcher->find(*status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000588 } else {
589 result = regexp->fMatcher->find(startIndex, *status);
590 }
591 return result;
592}
593
594
595//------------------------------------------------------------------------------
596//
597// uregex_findNext
598//
599//------------------------------------------------------------------------------
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800600U_CAPI UBool U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000601uregex_findNext(URegularExpression *regexp2,
602 UErrorCode *status) {
603 RegularExpression *regexp = (RegularExpression*)regexp2;
604 if (validateRE(regexp, TRUE, status) == FALSE) {
605 return FALSE;
606 }
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800607 UBool result = regexp->fMatcher->find(*status);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000608 return result;
609}
610
611//------------------------------------------------------------------------------
612//
613// uregex_groupCount
614//
615//------------------------------------------------------------------------------
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800616U_CAPI int32_t U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000617uregex_groupCount(URegularExpression *regexp2,
618 UErrorCode *status) {
619 RegularExpression *regexp = (RegularExpression*)regexp2;
620 if (validateRE(regexp, FALSE, status) == FALSE) {
621 return 0;
622 }
623 int32_t result = regexp->fMatcher->groupCount();
624 return result;
625}
626
627
628//------------------------------------------------------------------------------
629//
Jungshik Shin70f82502016-01-29 00:32:36 -0800630// uregex_groupNumberFromName
631//
632//------------------------------------------------------------------------------
633int32_t
634uregex_groupNumberFromName(URegularExpression *regexp2,
635 const UChar *groupName,
636 int32_t nameLength,
637 UErrorCode *status) {
638 RegularExpression *regexp = (RegularExpression*)regexp2;
639 if (validateRE(regexp, FALSE, status) == FALSE) {
640 return 0;
641 }
642 int32_t result = regexp->fPat->groupNumberFromName(UnicodeString(groupName, nameLength), *status);
643 return result;
644}
645
646int32_t
647uregex_groupNumberFromCName(URegularExpression *regexp2,
648 const char *groupName,
649 int32_t nameLength,
650 UErrorCode *status) {
651 RegularExpression *regexp = (RegularExpression*)regexp2;
652 if (validateRE(regexp, FALSE, status) == FALSE) {
653 return 0;
654 }
655 return regexp->fPat->groupNumberFromName(groupName, nameLength, *status);
656}
657
658//------------------------------------------------------------------------------
659//
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000660// uregex_group
661//
662//------------------------------------------------------------------------------
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800663U_CAPI int32_t U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000664uregex_group(URegularExpression *regexp2,
665 int32_t groupNum,
666 UChar *dest,
667 int32_t destCapacity,
668 UErrorCode *status) {
669 RegularExpression *regexp = (RegularExpression*)regexp2;
670 if (validateRE(regexp, TRUE, status) == FALSE) {
671 return 0;
672 }
673 if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
674 *status = U_ILLEGAL_ARGUMENT_ERROR;
675 return 0;
676 }
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800677
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000678 if (destCapacity == 0 || regexp->fText != NULL) {
679 // If preflighting or if we already have the text as UChars,
Jungshik Shin70f82502016-01-29 00:32:36 -0800680 // this is a little cheaper than extracting from the UText
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800681
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000682 //
683 // Pick up the range of characters from the matcher
684 //
685 int32_t startIx = regexp->fMatcher->start(groupNum, *status);
686 int32_t endIx = regexp->fMatcher->end (groupNum, *status);
687 if (U_FAILURE(*status)) {
688 return 0;
689 }
690
691 //
692 // Trim length based on buffer capacity
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800693 //
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000694 int32_t fullLength = endIx - startIx;
695 int32_t copyLength = fullLength;
696 if (copyLength < destCapacity) {
697 dest[copyLength] = 0;
698 } else if (copyLength == destCapacity) {
699 *status = U_STRING_NOT_TERMINATED_WARNING;
700 } else {
701 copyLength = destCapacity;
702 *status = U_BUFFER_OVERFLOW_ERROR;
703 }
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800704
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000705 //
706 // Copy capture group to user's buffer
707 //
708 if (copyLength > 0) {
709 u_memcpy(dest, &regexp->fText[startIx], copyLength);
710 }
711 return fullLength;
712 } else {
Jungshik Shin70f82502016-01-29 00:32:36 -0800713 int64_t start = regexp->fMatcher->start64(groupNum, *status);
714 int64_t limit = regexp->fMatcher->end64(groupNum, *status);
715 if (U_FAILURE(*status)) {
716 return 0;
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800717 }
Jungshik Shin70f82502016-01-29 00:32:36 -0800718 // Note edge cases:
719 // Group didn't match: start == end == -1. UText trims to 0, UText gives zero length result.
720 // Zero Length Match: start == end.
721 int32_t length = utext_extract(regexp->fMatcher->inputText(), start, limit, dest, destCapacity, status);
722 return length;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000723 }
Jungshik Shin70f82502016-01-29 00:32:36 -0800724
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000725}
726
727
728//------------------------------------------------------------------------------
729//
730// uregex_groupUText
731//
732//------------------------------------------------------------------------------
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800733U_CAPI UText * U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000734uregex_groupUText(URegularExpression *regexp2,
735 int32_t groupNum,
736 UText *dest,
737 int64_t *groupLength,
738 UErrorCode *status) {
739 RegularExpression *regexp = (RegularExpression*)regexp2;
740 if (validateRE(regexp, TRUE, status) == FALSE) {
741 UErrorCode emptyTextStatus = U_ZERO_ERROR;
742 return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
743 }
744
745 return regexp->fMatcher->group(groupNum, dest, *groupLength, *status);
746}
747
748//------------------------------------------------------------------------------
749//
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000750// uregex_start
751//
752//------------------------------------------------------------------------------
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800753U_CAPI int32_t U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000754uregex_start(URegularExpression *regexp2,
755 int32_t groupNum,
756 UErrorCode *status) {
757 return (int32_t)uregex_start64( regexp2, groupNum, status);
758}
759
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800760U_CAPI int64_t U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000761uregex_start64(URegularExpression *regexp2,
762 int32_t groupNum,
763 UErrorCode *status) {
764 RegularExpression *regexp = (RegularExpression*)regexp2;
765 if (validateRE(regexp, TRUE, status) == FALSE) {
766 return 0;
767 }
768 int32_t result = regexp->fMatcher->start(groupNum, *status);
769 return result;
770}
771
772//------------------------------------------------------------------------------
773//
774// uregex_end
775//
776//------------------------------------------------------------------------------
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800777U_CAPI int32_t U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000778uregex_end(URegularExpression *regexp2,
779 int32_t groupNum,
780 UErrorCode *status) {
781 return (int32_t)uregex_end64( regexp2, groupNum, status);
782}
783
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800784U_CAPI int64_t U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000785uregex_end64(URegularExpression *regexp2,
786 int32_t groupNum,
787 UErrorCode *status) {
788 RegularExpression *regexp = (RegularExpression*)regexp2;
789 if (validateRE(regexp, TRUE, status) == FALSE) {
790 return 0;
791 }
792 int32_t result = regexp->fMatcher->end(groupNum, *status);
793 return result;
794}
795
796//------------------------------------------------------------------------------
797//
798// uregex_reset
799//
800//------------------------------------------------------------------------------
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800801U_CAPI void U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000802uregex_reset(URegularExpression *regexp2,
803 int32_t index,
804 UErrorCode *status) {
805 uregex_reset64( regexp2, (int64_t)index, status);
806}
807
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800808U_CAPI void U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000809uregex_reset64(URegularExpression *regexp2,
810 int64_t index,
811 UErrorCode *status) {
812 RegularExpression *regexp = (RegularExpression*)regexp2;
813 if (validateRE(regexp, TRUE, status) == FALSE) {
814 return;
815 }
816 regexp->fMatcher->reset(index, *status);
817}
818
819
820//------------------------------------------------------------------------------
821//
822// uregex_setRegion
823//
824//------------------------------------------------------------------------------
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800825U_CAPI void U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000826uregex_setRegion(URegularExpression *regexp2,
827 int32_t regionStart,
828 int32_t regionLimit,
829 UErrorCode *status) {
830 uregex_setRegion64( regexp2, (int64_t)regionStart, (int64_t)regionLimit, status);
831}
832
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800833U_CAPI void U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000834uregex_setRegion64(URegularExpression *regexp2,
835 int64_t regionStart,
836 int64_t regionLimit,
837 UErrorCode *status) {
838 RegularExpression *regexp = (RegularExpression*)regexp2;
839 if (validateRE(regexp, TRUE, status) == FALSE) {
840 return;
841 }
842 regexp->fMatcher->region(regionStart, regionLimit, *status);
843}
844
845
846//------------------------------------------------------------------------------
847//
848// uregex_setRegionAndStart
849//
850//------------------------------------------------------------------------------
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800851U_CAPI void U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000852uregex_setRegionAndStart(URegularExpression *regexp2,
853 int64_t regionStart,
854 int64_t regionLimit,
855 int64_t startIndex,
856 UErrorCode *status) {
857 RegularExpression *regexp = (RegularExpression*)regexp2;
858 if (validateRE(regexp, TRUE, status) == FALSE) {
859 return;
860 }
861 regexp->fMatcher->region(regionStart, regionLimit, startIndex, *status);
862}
863
864//------------------------------------------------------------------------------
865//
866// uregex_regionStart
867//
868//------------------------------------------------------------------------------
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800869U_CAPI int32_t U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000870uregex_regionStart(const URegularExpression *regexp2,
871 UErrorCode *status) {
872 return (int32_t)uregex_regionStart64(regexp2, status);
873}
874
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800875U_CAPI int64_t U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000876uregex_regionStart64(const URegularExpression *regexp2,
877 UErrorCode *status) {
878 RegularExpression *regexp = (RegularExpression*)regexp2;
879 if (validateRE(regexp, TRUE, status) == FALSE) {
880 return 0;
881 }
882 return regexp->fMatcher->regionStart();
883}
884
885
886//------------------------------------------------------------------------------
887//
888// uregex_regionEnd
889//
890//------------------------------------------------------------------------------
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800891U_CAPI int32_t U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000892uregex_regionEnd(const URegularExpression *regexp2,
893 UErrorCode *status) {
894 return (int32_t)uregex_regionEnd64(regexp2, status);
895}
896
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800897U_CAPI int64_t U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000898uregex_regionEnd64(const URegularExpression *regexp2,
899 UErrorCode *status) {
900 RegularExpression *regexp = (RegularExpression*)regexp2;
901 if (validateRE(regexp, TRUE, status) == FALSE) {
902 return 0;
903 }
904 return regexp->fMatcher->regionEnd();
905}
906
907
908//------------------------------------------------------------------------------
909//
910// uregex_hasTransparentBounds
911//
912//------------------------------------------------------------------------------
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800913U_CAPI UBool U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000914uregex_hasTransparentBounds(const URegularExpression *regexp2,
915 UErrorCode *status) {
916 RegularExpression *regexp = (RegularExpression*)regexp2;
917 if (validateRE(regexp, FALSE, status) == FALSE) {
918 return FALSE;
919 }
920 return regexp->fMatcher->hasTransparentBounds();
921}
922
923
924//------------------------------------------------------------------------------
925//
926// uregex_useTransparentBounds
927//
928//------------------------------------------------------------------------------
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800929U_CAPI void U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000930uregex_useTransparentBounds(URegularExpression *regexp2,
931 UBool b,
932 UErrorCode *status) {
933 RegularExpression *regexp = (RegularExpression*)regexp2;
934 if (validateRE(regexp, FALSE, status) == FALSE) {
935 return;
936 }
937 regexp->fMatcher->useTransparentBounds(b);
938}
939
940
941//------------------------------------------------------------------------------
942//
943// uregex_hasAnchoringBounds
944//
945//------------------------------------------------------------------------------
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800946U_CAPI UBool U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000947uregex_hasAnchoringBounds(const URegularExpression *regexp2,
948 UErrorCode *status) {
949 RegularExpression *regexp = (RegularExpression*)regexp2;
950 if (validateRE(regexp, FALSE, status) == FALSE) {
951 return FALSE;
952 }
953 return regexp->fMatcher->hasAnchoringBounds();
954}
955
956
957//------------------------------------------------------------------------------
958//
959// uregex_useAnchoringBounds
960//
961//------------------------------------------------------------------------------
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800962U_CAPI void U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000963uregex_useAnchoringBounds(URegularExpression *regexp2,
964 UBool b,
965 UErrorCode *status) {
966 RegularExpression *regexp = (RegularExpression*)regexp2;
967 if (validateRE(regexp, FALSE, status) == FALSE) {
968 return;
969 }
970 regexp->fMatcher->useAnchoringBounds(b);
971}
972
973
974//------------------------------------------------------------------------------
975//
976// uregex_hitEnd
977//
978//------------------------------------------------------------------------------
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800979U_CAPI UBool U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000980uregex_hitEnd(const URegularExpression *regexp2,
981 UErrorCode *status) {
982 RegularExpression *regexp = (RegularExpression*)regexp2;
983 if (validateRE(regexp, TRUE, status) == FALSE) {
984 return FALSE;
985 }
986 return regexp->fMatcher->hitEnd();
987}
988
989
990//------------------------------------------------------------------------------
991//
992// uregex_requireEnd
993//
994//------------------------------------------------------------------------------
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -0800995U_CAPI UBool U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +0000996uregex_requireEnd(const URegularExpression *regexp2,
997 UErrorCode *status) {
998 RegularExpression *regexp = (RegularExpression*)regexp2;
999 if (validateRE(regexp, TRUE, status) == FALSE) {
1000 return FALSE;
1001 }
1002 return regexp->fMatcher->requireEnd();
1003}
1004
1005
1006//------------------------------------------------------------------------------
1007//
1008// uregex_setTimeLimit
1009//
1010//------------------------------------------------------------------------------
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001011U_CAPI void U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001012uregex_setTimeLimit(URegularExpression *regexp2,
1013 int32_t limit,
1014 UErrorCode *status) {
1015 RegularExpression *regexp = (RegularExpression*)regexp2;
1016 if (validateRE(regexp, FALSE, status)) {
1017 regexp->fMatcher->setTimeLimit(limit, *status);
1018 }
1019}
1020
1021
1022
1023//------------------------------------------------------------------------------
1024//
1025// uregex_getTimeLimit
1026//
1027//------------------------------------------------------------------------------
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001028U_CAPI int32_t U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001029uregex_getTimeLimit(const URegularExpression *regexp2,
1030 UErrorCode *status) {
1031 int32_t retVal = 0;
1032 RegularExpression *regexp = (RegularExpression*)regexp2;
1033 if (validateRE(regexp, FALSE, status)) {
1034 retVal = regexp->fMatcher->getTimeLimit();
1035 }
1036 return retVal;
1037}
1038
1039
1040
1041//------------------------------------------------------------------------------
1042//
1043// uregex_setStackLimit
1044//
1045//------------------------------------------------------------------------------
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001046U_CAPI void U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001047uregex_setStackLimit(URegularExpression *regexp2,
1048 int32_t limit,
1049 UErrorCode *status) {
1050 RegularExpression *regexp = (RegularExpression*)regexp2;
1051 if (validateRE(regexp, FALSE, status)) {
1052 regexp->fMatcher->setStackLimit(limit, *status);
1053 }
1054}
1055
1056
1057
1058//------------------------------------------------------------------------------
1059//
1060// uregex_getStackLimit
1061//
1062//------------------------------------------------------------------------------
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001063U_CAPI int32_t U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001064uregex_getStackLimit(const URegularExpression *regexp2,
1065 UErrorCode *status) {
1066 int32_t retVal = 0;
1067 RegularExpression *regexp = (RegularExpression*)regexp2;
1068 if (validateRE(regexp, FALSE, status)) {
1069 retVal = regexp->fMatcher->getStackLimit();
1070 }
1071 return retVal;
1072}
1073
1074
1075//------------------------------------------------------------------------------
1076//
1077// uregex_setMatchCallback
1078//
1079//------------------------------------------------------------------------------
1080U_CAPI void U_EXPORT2
1081uregex_setMatchCallback(URegularExpression *regexp2,
1082 URegexMatchCallback *callback,
1083 const void *context,
1084 UErrorCode *status) {
1085 RegularExpression *regexp = (RegularExpression*)regexp2;
1086 if (validateRE(regexp, FALSE, status)) {
1087 regexp->fMatcher->setMatchCallback(callback, context, *status);
1088 }
1089}
1090
1091
1092//------------------------------------------------------------------------------
1093//
1094// uregex_getMatchCallback
1095//
1096//------------------------------------------------------------------------------
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001097U_CAPI void U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001098uregex_getMatchCallback(const URegularExpression *regexp2,
1099 URegexMatchCallback **callback,
1100 const void **context,
1101 UErrorCode *status) {
1102 RegularExpression *regexp = (RegularExpression*)regexp2;
1103 if (validateRE(regexp, FALSE, status)) {
1104 regexp->fMatcher->getMatchCallback(*callback, *context, *status);
1105 }
1106}
1107
1108
1109//------------------------------------------------------------------------------
1110//
//    uregex_setFindProgressCallback
1112//
1113//------------------------------------------------------------------------------
1114U_CAPI void U_EXPORT2
1115uregex_setFindProgressCallback(URegularExpression *regexp2,
1116 URegexFindProgressCallback *callback,
1117 const void *context,
1118 UErrorCode *status) {
1119 RegularExpression *regexp = (RegularExpression*)regexp2;
1120 if (validateRE(regexp, FALSE, status)) {
1121 regexp->fMatcher->setFindProgressCallback(callback, context, *status);
1122 }
1123}
1124
1125
1126//------------------------------------------------------------------------------
1127//
//    uregex_getFindProgressCallback
1129//
1130//------------------------------------------------------------------------------
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001131U_CAPI void U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001132uregex_getFindProgressCallback(const URegularExpression *regexp2,
1133 URegexFindProgressCallback **callback,
1134 const void **context,
1135 UErrorCode *status) {
1136 RegularExpression *regexp = (RegularExpression*)regexp2;
1137 if (validateRE(regexp, FALSE, status)) {
1138 regexp->fMatcher->getFindProgressCallback(*callback, *context, *status);
1139 }
1140}
1141
1142
1143//------------------------------------------------------------------------------
1144//
1145// uregex_replaceAll
1146//
1147//------------------------------------------------------------------------------
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001148U_CAPI int32_t U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001149uregex_replaceAll(URegularExpression *regexp2,
1150 const UChar *replacementText,
1151 int32_t replacementLength,
1152 UChar *destBuf,
1153 int32_t destCapacity,
1154 UErrorCode *status) {
1155 RegularExpression *regexp = (RegularExpression*)regexp2;
1156 if (validateRE(regexp, TRUE, status) == FALSE) {
1157 return 0;
1158 }
1159 if (replacementText == NULL || replacementLength < -1 ||
1160 (destBuf == NULL && destCapacity > 0) ||
1161 destCapacity < 0) {
1162 *status = U_ILLEGAL_ARGUMENT_ERROR;
1163 return 0;
1164 }
1165
1166 int32_t len = 0;
1167
1168 uregex_reset(regexp2, 0, status);
1169
1170 // Note: Seperate error code variables for findNext() and appendReplacement()
1171 // are used so that destination buffer overflow errors
1172 // in appendReplacement won't stop findNext() from working.
1173 // appendReplacement() and appendTail() special case incoming buffer
1174 // overflow errors, continuing to return the correct length.
1175 UErrorCode findStatus = *status;
1176 while (uregex_findNext(regexp2, &findStatus)) {
1177 len += uregex_appendReplacement(regexp2, replacementText, replacementLength,
1178 &destBuf, &destCapacity, status);
1179 }
1180 len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status);
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001181
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001182 if (U_FAILURE(findStatus)) {
1183 // If anything went wrong with the findNext(), make that error trump
1184 // whatever may have happened with the append() operations.
1185 // Errors in findNext() are not expected.
1186 *status = findStatus;
1187 }
1188
1189 return len;
1190}
1191
1192
1193//------------------------------------------------------------------------------
1194//
1195// uregex_replaceAllUText
1196//
1197//------------------------------------------------------------------------------
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001198U_CAPI UText * U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001199uregex_replaceAllUText(URegularExpression *regexp2,
1200 UText *replacementText,
1201 UText *dest,
1202 UErrorCode *status) {
1203 RegularExpression *regexp = (RegularExpression*)regexp2;
1204 if (validateRE(regexp, TRUE, status) == FALSE) {
1205 return 0;
1206 }
1207 if (replacementText == NULL) {
1208 *status = U_ILLEGAL_ARGUMENT_ERROR;
1209 return 0;
1210 }
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001211
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001212 dest = regexp->fMatcher->replaceAll(replacementText, dest, *status);
1213 return dest;
1214}
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001215
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001216
1217//------------------------------------------------------------------------------
1218//
1219// uregex_replaceFirst
1220//
1221//------------------------------------------------------------------------------
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001222U_CAPI int32_t U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001223uregex_replaceFirst(URegularExpression *regexp2,
1224 const UChar *replacementText,
1225 int32_t replacementLength,
1226 UChar *destBuf,
1227 int32_t destCapacity,
1228 UErrorCode *status) {
1229 RegularExpression *regexp = (RegularExpression*)regexp2;
1230 if (validateRE(regexp, TRUE, status) == FALSE) {
1231 return 0;
1232 }
1233 if (replacementText == NULL || replacementLength < -1 ||
1234 (destBuf == NULL && destCapacity > 0) ||
1235 destCapacity < 0) {
1236 *status = U_ILLEGAL_ARGUMENT_ERROR;
1237 return 0;
1238 }
1239
1240 int32_t len = 0;
1241 UBool findSucceeded;
1242 uregex_reset(regexp2, 0, status);
1243 findSucceeded = uregex_find(regexp2, 0, status);
1244 if (findSucceeded) {
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001245 len = uregex_appendReplacement(regexp2, replacementText, replacementLength,
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001246 &destBuf, &destCapacity, status);
1247 }
1248 len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status);
1249
1250 return len;
1251}
1252
1253
1254//------------------------------------------------------------------------------
1255//
1256// uregex_replaceFirstUText
1257//
1258//------------------------------------------------------------------------------
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001259U_CAPI UText * U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001260uregex_replaceFirstUText(URegularExpression *regexp2,
1261 UText *replacementText,
1262 UText *dest,
1263 UErrorCode *status) {
1264 RegularExpression *regexp = (RegularExpression*)regexp2;
1265 if (validateRE(regexp, TRUE, status) == FALSE) {
1266 return 0;
1267 }
1268 if (replacementText == NULL) {
1269 *status = U_ILLEGAL_ARGUMENT_ERROR;
1270 return 0;
1271 }
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001272
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001273 dest = regexp->fMatcher->replaceFirst(replacementText, dest, *status);
1274 return dest;
1275}
1276
1277
1278//------------------------------------------------------------------------------
1279//
1280// uregex_appendReplacement
1281//
1282//------------------------------------------------------------------------------
1283
1284U_NAMESPACE_BEGIN
1285//
1286// Dummy class, because these functions need to be friends of class RegexMatcher,
1287// and stand-alone C functions don't work as friends
1288//
1289class RegexCImpl {
1290 public:
1291 inline static int32_t appendReplacement(RegularExpression *regexp,
1292 const UChar *replacementText,
1293 int32_t replacementLength,
1294 UChar **destBuf,
1295 int32_t *destCapacity,
1296 UErrorCode *status);
1297
1298 inline static int32_t appendTail(RegularExpression *regexp,
1299 UChar **destBuf,
1300 int32_t *destCapacity,
1301 UErrorCode *status);
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001302
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001303 inline static int32_t split(RegularExpression *regexp,
1304 UChar *destBuf,
1305 int32_t destCapacity,
1306 int32_t *requiredCapacity,
1307 UChar *destFields[],
1308 int32_t destFieldsCapacity,
1309 UErrorCode *status);
1310};
1311
1312U_NAMESPACE_END
1313
1314
1315
1316static const UChar BACKSLASH = 0x5c;
1317static const UChar DOLLARSIGN = 0x24;
Jungshik Shin70f82502016-01-29 00:32:36 -08001318static const UChar LEFTBRACKET = 0x7b;
1319static const UChar RIGHTBRACKET = 0x7d;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001320
1321//
1322// Move a character to an output buffer, with bounds checking on the index.
1323// Index advances even if capacity is exceeded, for preflight size computations.
1324// This little sequence is used a LOT.
1325//
1326static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) {
1327 if (*idx < bufCapacity) {
1328 buf[*idx] = c;
1329 }
1330 (*idx)++;
1331}
1332
1333
1334//
1335// appendReplacement, the actual implementation.
1336//
1337int32_t RegexCImpl::appendReplacement(RegularExpression *regexp,
1338 const UChar *replacementText,
1339 int32_t replacementLength,
1340 UChar **destBuf,
1341 int32_t *destCapacity,
1342 UErrorCode *status) {
1343
1344 // If we come in with a buffer overflow error, don't suppress the operation.
1345 // A series of appendReplacements, appendTail need to correctly preflight
1346 // the buffer size when an overflow happens somewhere in the middle.
1347 UBool pendingBufferOverflow = FALSE;
1348 if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
1349 pendingBufferOverflow = TRUE;
1350 *status = U_ZERO_ERROR;
1351 }
1352
1353 //
1354 // Validate all paramters
1355 //
1356 if (validateRE(regexp, TRUE, status) == FALSE) {
1357 return 0;
1358 }
1359 if (replacementText == NULL || replacementLength < -1 ||
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001360 destCapacity == NULL || destBuf == NULL ||
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001361 (*destBuf == NULL && *destCapacity > 0) ||
1362 *destCapacity < 0) {
1363 *status = U_ILLEGAL_ARGUMENT_ERROR;
1364 return 0;
1365 }
1366
1367 RegexMatcher *m = regexp->fMatcher;
1368 if (m->fMatch == FALSE) {
1369 *status = U_REGEX_INVALID_STATE;
1370 return 0;
1371 }
1372
1373 UChar *dest = *destBuf;
1374 int32_t capacity = *destCapacity;
1375 int32_t destIdx = 0;
1376 int32_t i;
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001377
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001378 // If it wasn't supplied by the caller, get the length of the replacement text.
1379 // TODO: slightly smarter logic in the copy loop could watch for the NUL on
1380 // the fly and avoid this step.
1381 if (replacementLength == -1) {
1382 replacementLength = u_strlen(replacementText);
1383 }
1384
1385 // Copy input string from the end of previous match to start of current match
1386 if (regexp->fText != NULL) {
1387 int32_t matchStart;
1388 int32_t lastMatchEnd;
1389 if (UTEXT_USES_U16(m->fInputText)) {
1390 lastMatchEnd = (int32_t)m->fLastMatchEnd;
1391 matchStart = (int32_t)m->fMatchStart;
1392 } else {
1393 // !!!: Would like a better way to do this!
Jungshik Shin70f82502016-01-29 00:32:36 -08001394 UErrorCode tempStatus = U_ZERO_ERROR;
1395 lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &tempStatus);
1396 tempStatus = U_ZERO_ERROR;
1397 matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &tempStatus);
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001398 }
1399 for (i=lastMatchEnd; i<matchStart; i++) {
1400 appendToBuf(regexp->fText[i], &destIdx, dest, capacity);
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001401 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001402 } else {
1403 UErrorCode possibleOverflowError = U_ZERO_ERROR; // ignore
1404 destIdx += utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart,
1405 dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity),
1406 &possibleOverflowError);
1407 }
1408 U_ASSERT(destIdx >= 0);
1409
1410 // scan the replacement text, looking for substitutions ($n) and \escapes.
1411 int32_t replIdx = 0;
Jungshik Shin70f82502016-01-29 00:32:36 -08001412 while (replIdx < replacementLength && U_SUCCESS(*status)) {
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001413 UChar c = replacementText[replIdx];
1414 replIdx++;
1415 if (c != DOLLARSIGN && c != BACKSLASH) {
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001416 // Common case, no substitution, no escaping,
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001417 // just copy the char to the dest buf.
1418 appendToBuf(c, &destIdx, dest, capacity);
1419 continue;
1420 }
1421
1422 if (c == BACKSLASH) {
1423 // Backslash Escape. Copy the following char out without further checks.
1424 // Note: Surrogate pairs don't need any special handling
1425 // The second half wont be a '$' or a '\', and
1426 // will move to the dest normally on the next
1427 // loop iteration.
1428 if (replIdx >= replacementLength) {
1429 break;
1430 }
1431 c = replacementText[replIdx];
1432
1433 if (c==0x55/*U*/ || c==0x75/*u*/) {
1434 // We have a \udddd or \Udddddddd escape sequence.
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001435 UChar32 escapedChar =
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001436 u_unescapeAt(uregex_ucstr_unescape_charAt,
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001437 &replIdx, // Index is updated by unescapeAt
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001438 replacementLength, // Length of replacement text
1439 (void *)replacementText);
1440
1441 if (escapedChar != (UChar32)0xFFFFFFFF) {
1442 if (escapedChar <= 0xffff) {
1443 appendToBuf((UChar)escapedChar, &destIdx, dest, capacity);
1444 } else {
1445 appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity);
1446 appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity);
1447 }
1448 continue;
1449 }
1450 // Note: if the \u escape was invalid, just fall through and
1451 // treat it as a plain \<anything> escape.
1452 }
1453
1454 // Plain backslash escape. Just put out the escaped character.
1455 appendToBuf(c, &destIdx, dest, capacity);
1456
1457 replIdx++;
1458 continue;
1459 }
1460
Jungshik Shin70f82502016-01-29 00:32:36 -08001461 // We've got a $. Pick up the following capture group name or number.
1462 // For numbers, consume only digits that produce a valid capture group for the pattern.
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001463
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001464 int32_t groupNum = 0;
Jungshik Shin70f82502016-01-29 00:32:36 -08001465 U_ASSERT(c == DOLLARSIGN);
1466 UChar32 c32;
1467 U16_GET(replacementText, 0, replIdx, replacementLength, c32);
1468 if (u_isdigit(c32)) {
1469 int32_t numDigits = 0;
1470 int32_t numCaptureGroups = m->fPattern->fGroupMap->size();
1471 for (;;) {
1472 if (replIdx >= replacementLength) {
1473 break;
1474 }
1475 U16_GET(replacementText, 0, replIdx, replacementLength, c32);
1476 if (u_isdigit(c32) == FALSE) {
1477 break;
1478 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001479
Jungshik Shin70f82502016-01-29 00:32:36 -08001480 int32_t digitVal = u_charDigitValue(c32);
1481 if (groupNum * 10 + digitVal <= numCaptureGroups) {
1482 groupNum = groupNum * 10 + digitVal;
1483 U16_FWD_1(replacementText, replIdx, replacementLength);
1484 numDigits++;
1485 } else {
1486 if (numDigits == 0) {
1487 *status = U_INDEX_OUTOFBOUNDS_ERROR;
1488 }
1489 break;
1490 }
1491 }
1492 } else if (c32 == LEFTBRACKET) {
1493 // Scan for Named Capture Group, ${name}.
1494 UnicodeString groupName;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001495 U16_FWD_1(replacementText, replIdx, replacementLength);
Jungshik Shin70f82502016-01-29 00:32:36 -08001496 while (U_SUCCESS(*status) && c32 != RIGHTBRACKET) {
1497 if (replIdx >= replacementLength) {
1498 *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
1499 break;
1500 }
1501 U16_NEXT(replacementText, replIdx, replacementLength, c32);
1502 if ((c32 >= 0x41 && c32 <= 0x5a) || // A..Z
1503 (c32 >= 0x61 && c32 <= 0x7a) || // a..z
1504 (c32 >= 0x31 && c32 <= 0x39)) { // 0..9
1505 groupName.append(c32);
1506 } else if (c32 == RIGHTBRACKET) {
1507 groupNum = uhash_geti(regexp->fPat->fNamedCaptureMap, &groupName);
1508 if (groupNum == 0) {
1509 // Name not defined by pattern.
1510 *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
1511 }
1512 } else {
1513 // Character was something other than a name char or a closing '}'
1514 *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
1515 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001516 }
Jungshik Shin70f82502016-01-29 00:32:36 -08001517 } else {
1518 // $ not followed by {name} or digits.
1519 *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001520 }
1521
1522
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001523 // Finally, append the capture group data to the destination.
Jungshik Shin70f82502016-01-29 00:32:36 -08001524 if (U_SUCCESS(*status)) {
1525 destIdx += uregex_group((URegularExpression*)regexp, groupNum,
1526 dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status);
1527 if (*status == U_BUFFER_OVERFLOW_ERROR) {
1528 // Ignore buffer overflow when extracting the group. We need to
1529 // continue on to get full size of the untruncated result. We will
1530 // raise our own buffer overflow error at the end.
1531 *status = U_ZERO_ERROR;
1532 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001533 }
1534
1535 if (U_FAILURE(*status)) {
Jungshik Shin70f82502016-01-29 00:32:36 -08001536 // bad group number or name.
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001537 break;
1538 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001539 }
1540
1541 //
1542 // Nul Terminate the dest buffer if possible.
1543 // Set the appropriate buffer overflow or not terminated error, if needed.
1544 //
1545 if (destIdx < capacity) {
1546 dest[destIdx] = 0;
Jungshik Shin70f82502016-01-29 00:32:36 -08001547 } else if (U_SUCCESS(*status)) {
1548 if (destIdx == *destCapacity) {
1549 *status = U_STRING_NOT_TERMINATED_WARNING;
1550 } else {
1551 *status = U_BUFFER_OVERFLOW_ERROR;
1552 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001553 }
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001554
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001555 //
1556 // Return an updated dest buffer and capacity to the caller.
1557 //
1558 if (destIdx > 0 && *destCapacity > 0) {
1559 if (destIdx < capacity) {
1560 *destBuf += destIdx;
1561 *destCapacity -= destIdx;
1562 } else {
1563 *destBuf += capacity;
1564 *destCapacity = 0;
1565 }
1566 }
1567
1568 // If we came in with a buffer overflow, make sure we go out with one also.
1569 // (A zero length match right at the end of the previous match could
1570 // make this function succeed even though a previous call had overflowed the buf)
1571 if (pendingBufferOverflow && U_SUCCESS(*status)) {
1572 *status = U_BUFFER_OVERFLOW_ERROR;
1573 }
1574
1575 return destIdx;
1576}
1577
1578//
1579// appendReplacement the actual API function,
1580//
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001581U_CAPI int32_t U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001582uregex_appendReplacement(URegularExpression *regexp2,
1583 const UChar *replacementText,
1584 int32_t replacementLength,
1585 UChar **destBuf,
1586 int32_t *destCapacity,
1587 UErrorCode *status) {
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001588
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001589 RegularExpression *regexp = (RegularExpression*)regexp2;
1590 return RegexCImpl::appendReplacement(
1591 regexp, replacementText, replacementLength,destBuf, destCapacity, status);
1592}
1593
1594//
1595// uregex_appendReplacementUText...can just use the normal C++ method
1596//
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001597U_CAPI void U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001598uregex_appendReplacementUText(URegularExpression *regexp2,
1599 UText *replText,
1600 UText *dest,
1601 UErrorCode *status) {
1602 RegularExpression *regexp = (RegularExpression*)regexp2;
1603 regexp->fMatcher->appendReplacement(dest, replText, *status);
1604}
1605
1606
1607//------------------------------------------------------------------------------
1608//
1609// uregex_appendTail
1610//
1611//------------------------------------------------------------------------------
1612int32_t RegexCImpl::appendTail(RegularExpression *regexp,
1613 UChar **destBuf,
1614 int32_t *destCapacity,
1615 UErrorCode *status)
1616{
1617
1618 // If we come in with a buffer overflow error, don't suppress the operation.
1619 // A series of appendReplacements, appendTail need to correctly preflight
1620 // the buffer size when an overflow happens somewhere in the middle.
1621 UBool pendingBufferOverflow = FALSE;
1622 if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
1623 pendingBufferOverflow = TRUE;
1624 *status = U_ZERO_ERROR;
1625 }
1626
1627 if (validateRE(regexp, TRUE, status) == FALSE) {
1628 return 0;
1629 }
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001630
1631 if (destCapacity == NULL || destBuf == NULL ||
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001632 (*destBuf == NULL && *destCapacity > 0) ||
1633 *destCapacity < 0)
1634 {
1635 *status = U_ILLEGAL_ARGUMENT_ERROR;
1636 return 0;
1637 }
1638
1639 RegexMatcher *m = regexp->fMatcher;
1640
1641 int32_t destIdx = 0;
1642 int32_t destCap = *destCapacity;
1643 UChar *dest = *destBuf;
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001644
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001645 if (regexp->fText != NULL) {
1646 int32_t srcIdx;
1647 int64_t nativeIdx = (m->fMatch ? m->fMatchEnd : m->fLastMatchEnd);
1648 if (nativeIdx == -1) {
1649 srcIdx = 0;
1650 } else if (UTEXT_USES_U16(m->fInputText)) {
1651 srcIdx = (int32_t)nativeIdx;
1652 } else {
1653 UErrorCode status = U_ZERO_ERROR;
1654 srcIdx = utext_extract(m->fInputText, 0, nativeIdx, NULL, 0, &status);
1655 }
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001656
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001657 for (;;) {
1658 U_ASSERT(destIdx >= 0);
1659
1660 if (srcIdx == regexp->fTextLength) {
1661 break;
1662 }
1663 UChar c = regexp->fText[srcIdx];
1664 if (c == 0 && regexp->fTextLength == -1) {
1665 regexp->fTextLength = srcIdx;
1666 break;
1667 }
1668
1669 if (destIdx < destCap) {
1670 dest[destIdx] = c;
1671 } else {
1672 // We've overflowed the dest buffer.
1673 // If the total input string length is known, we can
1674 // compute the total buffer size needed without scanning through the string.
1675 if (regexp->fTextLength > 0) {
1676 destIdx += (regexp->fTextLength - srcIdx);
1677 break;
1678 }
1679 }
1680 srcIdx++;
1681 destIdx++;
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001682 }
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001683 } else {
1684 int64_t srcIdx;
1685 if (m->fMatch) {
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001686 // The most recent call to find() succeeded.
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001687 srcIdx = m->fMatchEnd;
1688 } else {
1689 // The last call to find() on this matcher failed().
1690 // Look back to the end of the last find() that succeeded for src index.
1691 srcIdx = m->fLastMatchEnd;
1692 if (srcIdx == -1) {
1693 // There has been no successful match with this matcher.
1694 // We want to copy the whole string.
1695 srcIdx = 0;
1696 }
1697 }
1698
1699 destIdx = utext_extract(m->fInputText, srcIdx, m->fInputLength, dest, destCap, status);
1700 }
1701
1702 //
1703 // NUL terminate the output string, if possible, otherwise issue the
1704 // appropriate error or warning.
1705 //
1706 if (destIdx < destCap) {
1707 dest[destIdx] = 0;
1708 } else if (destIdx == destCap) {
1709 *status = U_STRING_NOT_TERMINATED_WARNING;
1710 } else {
1711 *status = U_BUFFER_OVERFLOW_ERROR;
1712 }
1713
1714 //
1715 // Update the user's buffer ptr and capacity vars to reflect the
1716 // amount used.
1717 //
1718 if (destIdx < destCap) {
1719 *destBuf += destIdx;
1720 *destCapacity -= destIdx;
1721 } else if (*destBuf != NULL) {
1722 *destBuf += destCap;
1723 *destCapacity = 0;
1724 }
1725
1726 if (pendingBufferOverflow && U_SUCCESS(*status)) {
1727 *status = U_BUFFER_OVERFLOW_ERROR;
1728 }
1729
1730 return destIdx;
1731}
1732
1733
1734//
1735// appendTail the actual API function
1736//
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001737U_CAPI int32_t U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001738uregex_appendTail(URegularExpression *regexp2,
1739 UChar **destBuf,
1740 int32_t *destCapacity,
1741 UErrorCode *status) {
1742 RegularExpression *regexp = (RegularExpression*)regexp2;
1743 return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status);
1744}
1745
1746
1747//
1748// uregex_appendTailUText...can just use the normal C++ method
1749//
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001750U_CAPI UText * U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001751uregex_appendTailUText(URegularExpression *regexp2,
1752 UText *dest,
1753 UErrorCode *status) {
1754 RegularExpression *regexp = (RegularExpression*)regexp2;
1755 return regexp->fMatcher->appendTail(dest, *status);
1756}
1757
1758
//------------------------------------------------------------------------------
//
//    copyString     Internal utility, currently compiled out by the #if 0
//                   below.  Copies srcLen UChars from srcPtr into destBuffer
//                   beginning at *destIndex, for as long as the index stays
//                   below destCapacity.  A terminating NUL is stored when
//                   there is room for it.
//
//                   On return *destIndex has been advanced by srcLen + 1
//                   (the NUL is counted) whether or not everything fit, so
//                   it may exceed destCapacity; this is what makes
//                   preflight size computation possible.
//
//------------------------------------------------------------------------------
#if 0
static void copyString(UChar        *destBuffer,    //  Destination buffer.
                       int32_t       destCapacity,  //  Total capacity of dest buffer
                       int32_t      *destIndex,     //  Index into dest buffer.  Updated on return.
                                                    //    Update not clipped to destCapacity.
                       const UChar  *srcPtr,        //  Pointer to source string
                       int32_t       srcLen)        //  Source string len.
{
    int32_t writePos = *destIndex;
    int32_t readPos  = 0;

    // Copy while there is both input left and room in the destination.
    while (readPos < srcLen && writePos < destCapacity) {
        destBuffer[writePos++] = srcPtr[readPos++];
    }

    // Account for whatever did not fit, without writing it.
    if (readPos < srcLen) {
        writePos += srcLen - readPos;
    }

    if (writePos < destCapacity) {
        destBuffer[writePos] = 0;
    }

    // The NUL is counted even when it could not be stored.
    *destIndex = writePos + 1;
}
#endif
1796
1797//------------------------------------------------------------------------------
1798//
1799// uregex_split
1800//
1801//------------------------------------------------------------------------------
1802int32_t RegexCImpl::split(RegularExpression *regexp,
1803 UChar *destBuf,
1804 int32_t destCapacity,
1805 int32_t *requiredCapacity,
1806 UChar *destFields[],
1807 int32_t destFieldsCapacity,
1808 UErrorCode *status) {
1809 //
1810 // Reset for the input text
1811 //
1812 regexp->fMatcher->reset();
1813 UText *inputText = regexp->fMatcher->fInputText;
1814 int64_t nextOutputStringStart = 0;
1815 int64_t inputLen = regexp->fMatcher->fInputLength;
1816 if (inputLen == 0) {
1817 return 0;
1818 }
1819
1820 //
1821 // Loop through the input text, searching for the delimiter pattern
1822 //
1823 int32_t i; // Index of the field being processed.
1824 int32_t destIdx = 0; // Next available position in destBuf;
1825 int32_t numCaptureGroups = regexp->fMatcher->groupCount();
1826 UErrorCode tStatus = U_ZERO_ERROR; // Want to ignore any buffer overflow errors so that the strings are still counted
1827 for (i=0; ; i++) {
1828 if (i>=destFieldsCapacity-1) {
1829 // There are one or zero output strings left.
1830 // Fill the last output string with whatever is left from the input, then exit the loop.
1831 // ( i will be == destFieldsCapacity if we filled the output array while processing
1832 // capture groups of the delimiter expression, in which case we will discard the
1833 // last capture group saved in favor of the unprocessed remainder of the
1834 // input string.)
1835 if (inputLen > nextOutputStringStart) {
1836 if (i != destFieldsCapacity-1) {
1837 // No fields are left. Recycle the last one for holding the trailing part of
1838 // the input string.
1839 i = destFieldsCapacity-1;
1840 destIdx = (int32_t)(destFields[i] - destFields[0]);
1841 }
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001842
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001843 destFields[i] = &destBuf[destIdx];
1844 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen,
1845 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status);
1846 }
1847 break;
1848 }
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001849
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001850 if (regexp->fMatcher->find()) {
1851 // We found another delimiter. Move everything from where we started looking
1852 // up until the start of the delimiter into the next output string.
1853 destFields[i] = &destBuf[destIdx];
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001854
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001855 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, regexp->fMatcher->fMatchStart,
1856 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus);
1857 if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
1858 tStatus = U_ZERO_ERROR;
1859 } else {
1860 *status = tStatus;
1861 }
1862 nextOutputStringStart = regexp->fMatcher->fMatchEnd;
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001863
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001864 // If the delimiter pattern has capturing parentheses, the captured
1865 // text goes out into the next n destination strings.
1866 int32_t groupNum;
1867 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
1868 // If we've run out of output string slots, bail out.
1869 if (i==destFieldsCapacity-1) {
1870 break;
1871 }
1872 i++;
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001873
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001874 // Set up to extract the capture group contents into the dest buffer.
1875 destFields[i] = &destBuf[destIdx];
1876 tStatus = U_ZERO_ERROR;
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001877 int32_t t = uregex_group((URegularExpression*)regexp,
1878 groupNum,
1879 destFields[i],
1880 REMAINING_CAPACITY(destIdx, destCapacity),
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001881 &tStatus);
1882 destIdx += t + 1; // Record the space used in the output string buffer.
1883 // +1 for the NUL that terminates the string.
1884 if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
1885 tStatus = U_ZERO_ERROR;
1886 } else {
1887 *status = tStatus;
1888 }
1889 }
1890
1891 if (nextOutputStringStart == inputLen) {
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001892 // The delimiter was at the end of the string.
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001893 // Output an empty string, and then we are done.
1894 if (destIdx < destCapacity) {
1895 destBuf[destIdx] = 0;
1896 }
1897 if (i < destFieldsCapacity-1) {
1898 ++i;
1899 }
1900 if (destIdx < destCapacity) {
1901 destFields[i] = destBuf + destIdx;
1902 }
1903 ++destIdx;
1904 break;
1905 }
1906
1907 }
1908 else
1909 {
1910 // We ran off the end of the input while looking for the next delimiter.
1911 // All the remaining text goes into the current output string.
1912 destFields[i] = &destBuf[destIdx];
1913 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen,
1914 &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status);
1915 break;
1916 }
1917 }
1918
1919 // Zero out any unused portion of the destFields array
1920 int j;
1921 for (j=i+1; j<destFieldsCapacity; j++) {
1922 destFields[j] = NULL;
1923 }
1924
1925 if (requiredCapacity != NULL) {
1926 *requiredCapacity = destIdx;
1927 }
1928 if (destIdx > destCapacity) {
1929 *status = U_BUFFER_OVERFLOW_ERROR;
1930 }
1931 return i+1;
1932}
1933
1934//
1935// uregex_split The actual API function
1936//
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001937U_CAPI int32_t U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001938uregex_split(URegularExpression *regexp2,
1939 UChar *destBuf,
1940 int32_t destCapacity,
1941 int32_t *requiredCapacity,
1942 UChar *destFields[],
1943 int32_t destFieldsCapacity,
1944 UErrorCode *status) {
1945 RegularExpression *regexp = (RegularExpression*)regexp2;
1946 if (validateRE(regexp, TRUE, status) == FALSE) {
1947 return 0;
1948 }
1949 if ((destBuf == NULL && destCapacity > 0) ||
1950 destCapacity < 0 ||
1951 destFields == NULL ||
1952 destFieldsCapacity < 1 ) {
1953 *status = U_ILLEGAL_ARGUMENT_ERROR;
1954 return 0;
1955 }
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001956
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001957 return RegexCImpl::split(regexp, destBuf, destCapacity, requiredCapacity, destFields, destFieldsCapacity, status);
1958}
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001959
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001960
1961//
1962// uregex_splitUText...can just use the normal C++ method
1963//
Jungshik Shin (jungshik at google)0f8746a2015-01-08 15:46:45 -08001964U_CAPI int32_t U_EXPORT2
jshin@chromium.org6f31ac32014-03-26 22:15:14 +00001965uregex_splitUText(URegularExpression *regexp2,
1966 UText *destFields[],
1967 int32_t destFieldsCapacity,
1968 UErrorCode *status) {
1969 RegularExpression *regexp = (RegularExpression*)regexp2;
1970 return regexp->fMatcher->split(regexp->fMatcher->inputText(), destFields, destFieldsCapacity, *status);
1971}
1972
1973
1974#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1975