@@ -153,87 +153,148 @@ function menu( $items, $default = null, $title = 'Choose an item' ) {
153
153
}
154
154
155
155
/**
 * Attempts an encoding-safe way of getting string length.
 *
 * Tries, in order: `grapheme_strlen()` (intl), `preg_match_all( '/\X/u' )` (PCRE with '\X'),
 * `mb_strlen()` (mb_string), and finally falls back to basic `strlen()`.
 *
 * @param string      $str      The string to check.
 * @param string|bool $encoding Optional. The encoding of the string. Default false.
 * @return int Numeric value that represents the string's length
 */
function safe_strlen( $str, $encoding = false ) {
	// Allow for selective testings - "1" bit set tests grapheme_strlen(), "2" preg_match_all( '/\X/u' ), "4" mb_strlen(), "other" strlen().
	$test_safe_strlen = getenv( 'PHP_CLI_TOOLS_TEST_SAFE_STRLEN' );

	// Assume UTF-8 if no encoding given - `grapheme_strlen()` will return null if given non-UTF-8 string.
	if ( ( ! $encoding || 'UTF-8' === $encoding ) && can_use_icu() && null !== ( $length = grapheme_strlen( $str ) ) ) {
		if ( ! $test_safe_strlen || ( $test_safe_strlen & 1 ) ) {
			return $length;
		}
	}
	// Assume UTF-8 if no encoding given - `preg_match_all()` will return false if given non-UTF-8 string.
	if ( ( ! $encoding || 'UTF-8' === $encoding ) && can_use_pcre_x() && false !== ( $length = preg_match_all( '/\X/u', $str, $dummy /*needed for PHP 5.3*/ ) ) ) {
		if ( ! $test_safe_strlen || ( $test_safe_strlen & 2 ) ) {
			return $length;
		}
	}
	// Legacy encodings and old PHPs will reach here.
	if ( function_exists( 'mb_strlen' ) && ( $encoding || function_exists( 'mb_detect_encoding' ) ) ) {
		if ( ! $encoding ) {
			$encoding = mb_detect_encoding( $str, null, true /*strict*/ );
		}
		// `mb_detect_encoding()` returns false when detection fails; false is not a valid encoding name,
		// so in that case let `mb_strlen()` use its default encoding instead of passing it on.
		$length = $encoding ? mb_strlen( $str, $encoding ) : mb_strlen( $str );
		if ( 'UTF-8' === $encoding ) {
			// Subtract combining characters.
			$length -= preg_match_all( get_unicode_regexs( 'm' ), $str, $dummy /*needed for PHP 5.3*/ );
		}
		if ( ! $test_safe_strlen || ( $test_safe_strlen & 4 ) ) {
			return $length;
		}
	}
	return strlen( $str );
}
178
195
179
196
/**
 * Attempts an encoding-safe way of getting a substring.
 *
 * Tries, in order: `grapheme_substr()` (intl), `preg_match()` with '\X' (PCRE),
 * `mb_substr()` (mb_string), and finally falls back to `substr()`.
 *
 * @param string        $str      The input string.
 * @param int           $start    The starting position of the substring.
 * @param int|bool|null $length   Optional, unless $is_width is set. Maximum length of the substring. Default false. Negative not supported.
 * @param int|bool      $is_width Optional. If set and encoding is UTF-8, $length (which must be specified) is interpreted as spacing width. Default false.
 * @param string|bool   $encoding Optional. The encoding of the string. Default false.
 * @return bool|string False if given unsupported args, otherwise substring of string specified by start and length parameters
 */
function safe_substr( $str, $start, $length = false, $is_width = false, $encoding = false ) {
	// Negative $length or $is_width and $length not specified not supported.
	if ( $length < 0 || ( $is_width && ( null === $length || false === $length ) ) ) {
		return false;
	}
	// Tracks whether $length already holds the full `safe_strlen()` of $str, to avoid recomputing it below.
	$have_safe_strlen = false;
	// PHP 5.3 substr takes false as full length, PHP > 5.3 takes null - for compat. do `safe_strlen()`.
	if ( null === $length || false === $length ) {
		$length = safe_strlen( $str, $encoding );
		$have_safe_strlen = true;
	}

	// Allow for selective testings - "1" bit set tests grapheme_substr(), "2" preg_match( '/\X/' ), "4" mb_substr(), "8" substr().
	$test_safe_substr = getenv( 'PHP_CLI_TOOLS_TEST_SAFE_SUBSTR' );

	// Assume UTF-8 if no encoding given - `grapheme_substr()` will return false (not null like `grapheme_strlen()`) if given non-UTF-8 string.
	if ( ( ! $encoding || 'UTF-8' === $encoding ) && can_use_icu() && false !== ( $try = grapheme_substr( $str, $start, $length ) ) ) {
		if ( ! $test_safe_substr || ( $test_safe_substr & 1 ) ) {
			return $is_width ? _safe_substr_eaw( $try, $length ) : $try;
		}
	}
	// Assume UTF-8 if no encoding given - `preg_match()` will return false if given non-UTF-8 string.
	if ( ( ! $encoding || 'UTF-8' === $encoding ) && can_use_pcre_x() ) {
		// Convert a negative start into an offset from the beginning, clamped at 0.
		if ( $start < 0 ) {
			$start = max( $start + ( $have_safe_strlen ? $length : safe_strlen( $str, $encoding ) ), 0 );
		}
		if ( $start ) {
			// Skip $start grapheme clusters, then capture up to $length of them.
			if ( preg_match( '/^\X{' . $start . '}(\X{0,' . $length . '})/u', $str, $matches ) ) {
				if ( ! $test_safe_substr || ( $test_safe_substr & 2 ) ) {
					return $is_width ? _safe_substr_eaw( $matches[1], $length ) : $matches[1];
				}
			}
		} else {
			if ( preg_match( '/^\X{0,' . $length . '}/u', $str, $matches ) ) {
				if ( ! $test_safe_substr || ( $test_safe_substr & 2 ) ) {
					return $is_width ? _safe_substr_eaw( $matches[0], $length ) : $matches[0];
				}
			}
		}
	}
	// Legacy encodings and old PHPs will reach here.
	if ( function_exists( 'mb_substr' ) && ( $encoding || function_exists( 'mb_detect_encoding' ) ) ) {
		if ( ! $encoding ) {
			$encoding = mb_detect_encoding( $str, null, true /*strict*/ );
		}
		// Bug: not adjusting for combining chars.
		// `mb_detect_encoding()` returns false when detection fails; false is not a valid encoding name,
		// so in that case let `mb_substr()` use its default encoding instead of passing it on.
		$try = $encoding ? mb_substr( $str, $start, $length, $encoding ) : mb_substr( $str, $start, $length );
		if ( 'UTF-8' === $encoding && $is_width ) {
			$try = _safe_substr_eaw( $try, $length );
		}
		if ( ! $test_safe_substr || ( $test_safe_substr & 4 ) ) {
			return $try;
		}
	}
	return substr( $str, $start, $length );
}
263
+
264
/**
 * Internal function used by `safe_substr()` to adjust for East Asian double-width chars.
 *
 * Treats `$length` as a spacing width: chars matching the East Asian Width regex count as 2,
 * all others as 1. A double-width char that would straddle the limit is excluded, except
 * that at least one char is always kept.
 *
 * @param string $str    The (UTF-8) substring already cut to at most `$length` chars.
 * @param int    $length The maximum spacing width.
 * @return string
 */
function _safe_substr_eaw( $str, $length ) {
	// Set the East Asian Width regex.
	$eaw_regex = get_unicode_regexs( 'eaw' );

	// If there's any East Asian double-width chars...
	if ( preg_match( $eaw_regex, $str ) ) {
		// Note that if the length ends in the middle of a double-width char, the char is excluded, not included.

		// See if it's all EAW. This shortcut needs `mb_substr()`, which may be missing when we were reached
		// via the ICU or PCRE paths; the general branch below yields the same result without it.
		if ( function_exists( 'mb_substr' ) && preg_match_all( $eaw_regex, $str, $dummy /*needed for PHP 5.3*/ ) === $length ) {
			// Just halve the length so (rounded down to a minimum of 1).
			$str = mb_substr( $str, 0, max( (int) ( $length / 2 ), 1 ), 'UTF-8' );
		} else {
			// Explode string into an array of UTF-8 chars. Based on core `_mb_substr()` in "wp-includes/compat.php".
			$chars = preg_split( '/([\x00-\x7f\xc2-\xf4][^\x00-\x7f\xc2-\xf4]*)/', $str, $length + 1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY );
			$cnt = min( count( $chars ), $length );
			$width = $length;

			// Consume chars until the width budget is used up; $length is reused as the char count.
			for ( $length = 0; $length < $cnt && $width > 0; $length++ ) {
				$width -= preg_match( $eaw_regex, $chars[ $length ] ) ? 2 : 1;
			}
			// Round down to a minimum of 1.
			if ( $width < 0 && $length > 1 ) {
				$length--;
			}
			return join( '', array_slice( $chars, 0, $length ) );
		}
	}
	return $str;
}
238
299
239
300
/**
@@ -266,18 +327,19 @@ function strwidth( $string, $encoding = false ) {
266
327
// Allow for selective testings - "1" bit set tests grapheme_strlen(), "2" preg_match_all( '/\X/u' ), "4" mb_strwidth(), "other" safe_strlen().
267
328
$ test_strwidth = getenv ( 'PHP_CLI_TOOLS_TEST_STRWIDTH ' );
268
329
269
- // Assume UTF-8 - `grapheme_strlen()` will return null if given non-UTF-8 string.
270
- if ( function_exists ( ' grapheme_strlen ' ) && null !== ( $ width = grapheme_strlen ( $ string ) ) ) {
330
+ // Assume UTF-8 if no encoding given - `grapheme_strlen()` will return null if given non-UTF-8 string.
331
+ if ( ( ! $ encoding || ' UTF-8 ' === $ encoding ) && can_use_icu ( ) && null !== ( $ width = grapheme_strlen ( $ string ) ) ) {
271
332
if ( ! $ test_strwidth || ( $ test_strwidth & 1 ) ) {
272
333
return $ width + preg_match_all ( $ eaw_regex , $ string , $ dummy /*needed for PHP 5.3*/ );
273
334
}
274
335
}
275
- // Assume UTF-8 - `preg_match_all()` will return false if given non-UTF-8 string (or if PCRE UTF-8 mode is unavailable) .
276
- if ( false !== ( $ width = preg_match_all ( '/\X/u ' , $ string , $ dummy /*needed for PHP 5.3*/ ) ) ) {
336
+ // Assume UTF-8 if no encoding given - `preg_match_all()` will return false if given non-UTF-8 string.
337
+ if ( ( ! $ encoding || ' UTF-8 ' === $ encoding ) && can_use_pcre_x () && false !== ( $ width = preg_match_all ( '/\X/u ' , $ string , $ dummy /*needed for PHP 5.3*/ ) ) ) {
277
338
if ( ! $ test_strwidth || ( $ test_strwidth & 2 ) ) {
278
339
return $ width + preg_match_all ( $ eaw_regex , $ string , $ dummy /*needed for PHP 5.3*/ );
279
340
}
280
341
}
342
+ // Legacy encodings and old PHPs will reach here.
281
343
if ( function_exists ( 'mb_strwidth ' ) && ( $ encoding || function_exists ( 'mb_detect_encoding ' ) ) ) {
282
344
if ( ! $ encoding ) {
283
345
$ encoding = mb_detect_encoding ( $ string , null , true /*strict*/ );
@@ -294,6 +356,40 @@ function strwidth( $string, $encoding = false ) {
294
356
return safe_strlen ( $ string , $ encoding );
295
357
}
296
358
359
/**
 * Reports whether the ICU-backed grapheme functions can be relied on.
 *
 * Requires ICU 54.1 or later plus both `grapheme_strlen()` and `grapheme_substr()`.
 * The answer is computed once and remembered for subsequent calls.
 *
 * @return bool
 */
function can_use_icu() {
	static $cached = null;

	if ( null !== $cached ) {
		return $cached;
	}

	$cached = false;
	if ( defined( 'INTL_ICU_VERSION' ) ) {
		// Choosing ICU 54, Unicode 7.0.
		$icu_is_recent = version_compare( INTL_ICU_VERSION, '54.1', '>=' );
		$cached = $icu_is_recent && function_exists( 'grapheme_strlen' ) && function_exists( 'grapheme_substr' );
	}

	return $cached;
}
374
+
375
/**
 * Reports whether the PCRE Unicode extended grapheme cluster escape '\X' can be used.
 *
 * The answer is computed once and remembered for subsequent calls.
 *
 * @return bool
 */
function can_use_pcre_x() {
	static $cached = null;

	if ( null !== $cached ) {
		return $cached;
	}

	// Keep only the leading "digits and dots" part of PCRE_VERSION, dropping any trailing date stuff.
	$numeric_len = strspn( PCRE_VERSION, '0123456789.' );
	$pcre_version = substr( PCRE_VERSION, 0, $numeric_len );

	// '\X' introduced (as Unicode extended grapheme cluster) in PCRE 8.32 - see https://vcs.pcre.org/pcre/code/tags/pcre-8.32/ChangeLog?view=markup line 53.
	// Older versions of PCRE were bundled with PHP <= 5.3.23 & <= 5.4.13.
	if ( version_compare( $pcre_version, '8.32', '>=' ) ) {
		// Only probe the pattern once the version check has passed.
		$cached = false !== @preg_match( '/\X/u', '' );
	} else {
		$cached = false;
	}

	return $cached;
}
392
+
297
393
/**
298
394
* Get the regexs generated from Unicode data.
299
395
*
0 commit comments