1
1
package com .networknt .schema .regex ;
2
2
3
+ import java .nio .charset .Charset ;
3
4
import java .nio .charset .StandardCharsets ;
4
5
import java .util .regex .Pattern ;
5
6
7
+ import org .jcodings .ApplyAllCaseFoldFunction ;
8
+ import org .jcodings .CaseFoldCodeItem ;
9
+ import org .jcodings .CodeRange ;
10
+ import org .jcodings .Encoding ;
11
+ import org .jcodings .IntHolder ;
12
+ import org .jcodings .constants .CharacterType ;
6
13
import org .jcodings .specific .UTF8Encoding ;
14
+ import org .jcodings .unicode .UnicodeCodeRange ;
7
15
import org .joni .Option ;
8
16
import org .joni .Regex ;
9
17
import org .joni .Syntax ;
@@ -42,17 +50,8 @@ class JoniRegularExpression implements RegularExpression {
42
50
43
51
JoniRegularExpression (String regex , Syntax syntax ) {
44
52
validate (regex );
45
- // Joni is too liberal on some constructs
46
- String s = regex
47
- .replace ("\\ d" , "[0-9]" )
48
- .replace ("\\ D" , "[^0-9]" )
49
- .replace ("\\ w" , "[a-zA-Z0-9_]" )
50
- .replace ("\\ W" , "[^a-zA-Z0-9_]" )
51
- .replace ("\\ s" , "[ \\ f\\ n\\ r\\ t\\ v\\ u00a0\\ u1680\\ u2000-\\ u200a\\ u2028\\ u2029\\ u202f\\ u205f\\ u3000\\ ufeff]" )
52
- .replace ("\\ S" , "[^ \\ f\\ n\\ r\\ t\\ v\\ u00a0\\ u1680\\ u2000-\\ u200a\\ u2028\\ u2029\\ u202f\\ u205f\\ u3000\\ ufeff]" );
53
-
54
- byte [] bytes = s .getBytes (StandardCharsets .UTF_8 );
55
- this .pattern = new Regex (bytes , 0 , bytes .length , Option .SINGLELINE , UTF8Encoding .INSTANCE , syntax );
53
+ byte [] bytes = regex .getBytes (StandardCharsets .UTF_8 );
54
+ this .pattern = new Regex (bytes , 0 , bytes .length , Option .SINGLELINE , ECMAScriptUTF8Encoding .INSTANCE , syntax );
56
55
}
57
56
58
57
protected void validate (String regex ) {
@@ -73,4 +72,192 @@ public boolean matches(String value) {
73
72
return this .pattern .matcher (bytes ).search (0 , bytes .length , Option .NONE ) >= 0 ;
74
73
}
75
74
75
+ static class Arrays {
76
+ public static boolean equals (byte [] a , byte [] a2 , int p , int end ) {
77
+ if (a ==a2 ) {
78
+ return true ;
79
+ }
80
+ if (a ==null || a2 ==null ) {
81
+ return false ;
82
+ }
83
+
84
+ int length = a .length ;
85
+ if ((end - p ) != length ) {
86
+ return false ;
87
+ }
88
+
89
+ for (int i =0 ; i <length ; i ++) {
90
+ if (a [i ] != a2 [i +p ]) {
91
+ return false ;
92
+ }
93
+ }
94
+ return true ;
95
+ }
96
+ }
97
+
98
+ /**
99
+ * An {@link Encoding} that returns the appropriate code ranges that correspond
100
+ * to the ECMA-262 regular expression implementation instead of matching
101
+ * directly to a Unicode General Category.
102
+ */
103
+ public static class ECMAScriptUTF8Encoding extends DelegatingEncoding {
104
+ /*
105
+ * [0-9]
106
+ */
107
+ private static final int [] CR_DIGIT = { 1 , '0' , '9' };
108
+ /*
109
+ * [a-zA-Z0-9_]
110
+ */
111
+ private static final int [] CR_WORD = { 4 , '0' , '9' , 'A' , 'Z' , '_' , '_' , 'a' , 'z' };
112
+ /*
113
+ * [\f\n\r\t\v\u0020\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff]
114
+ */
115
+ private static final int [] CR_SPACE = { 10 , '\t' , '\r' , ' ' , ' ' , '\u00a0' , '\u00a0' , '\u1680' , '\u1680' , '\u2000' ,
116
+ '\u200a' , '\u2028' , '\u2029' , '\u202f' , '\u202f' , '\u205f' , '\u205f' , '\u3000' , '\u3000' , '\ufeff' ,
117
+ '\ufeff' };
118
+ /*
119
+ * For \p{digit}
120
+ */
121
+ private static final byte [] PROPERTY_NAME_DIGIT = { 100 , 105 , 103 , 105 , 116 };
122
+
123
+ public static final ECMAScriptUTF8Encoding INSTANCE = new ECMAScriptUTF8Encoding ();
124
+
125
+ protected ECMAScriptUTF8Encoding () {
126
+ super (UTF8Encoding .INSTANCE );
127
+ }
128
+
129
+ @ Override
130
+ public int [] ctypeCodeRange (int ctype , IntHolder sbOut ) {
131
+ switch (ctype ) {
132
+ case CharacterType .DIGIT : // \d
133
+ sbOut .value = 0x80 ;
134
+ return CR_DIGIT ;
135
+ case CharacterType .WORD : // \w
136
+ sbOut .value = 0x80 ;
137
+ return CR_WORD ;
138
+ case CharacterType .SPACE : // \s
139
+ sbOut .value = 0x80 ;
140
+ return CR_SPACE ;
141
+ }
142
+ return delegate .ctypeCodeRange (ctype , sbOut );
143
+ }
144
+
145
+ @ Override
146
+ public boolean isCodeCType (int code , int ctype ) {
147
+ switch (ctype ) {
148
+ case CharacterType .DIGIT : // \d
149
+ return CodeRange .isInCodeRange (CR_DIGIT , code );
150
+ case CharacterType .WORD : // \w
151
+ return CodeRange .isInCodeRange (CR_WORD , code );
152
+ case CharacterType .SPACE : // \s
153
+ return CodeRange .isInCodeRange (CR_SPACE , code );
154
+ }
155
+ return delegate .isCodeCType (code , ctype );
156
+ }
157
+
158
+ @ Override
159
+ public int propertyNameToCType (byte []name , int p , int end ) {
160
+ if (Arrays .equals (PROPERTY_NAME_DIGIT , name , p , end )) {
161
+ return UnicodeCodeRange .ND .ordinal ();// 55 Same as \p{Nd} and not returning CharacterType.DIGIT
162
+ }
163
+ return delegate .propertyNameToCType (name , p , end );
164
+ }
165
+ }
166
+
167
+ /**
168
+ * An {@link Encoding} that delegates to another {@link Encoding}.
169
+ * <p>
170
+ * This can be used to customize the behavior of implementations that are final.
171
+ */
172
+ public static class DelegatingEncoding extends Encoding {
173
+ protected final Encoding delegate ;
174
+ protected DelegatingEncoding (Encoding delegate ) {
175
+ super (new String (delegate .getName ()), delegate .minLength (), delegate .maxLength ());
176
+ this .delegate = delegate ;
177
+ }
178
+ @ Override
179
+ public Charset getCharset () {
180
+ return delegate .getCharset ();
181
+ }
182
+ @ Override
183
+ public String getCharsetName () {
184
+ return delegate .getCharsetName ();
185
+ }
186
+ @ Override
187
+ public int length (byte c ) {
188
+ return delegate .length (c );
189
+ }
190
+ @ Override
191
+ public int length (byte [] bytes , int p , int end ) {
192
+ return delegate .length (bytes , p , end );
193
+ }
194
+ @ Override
195
+ public boolean isNewLine (byte [] bytes , int p , int end ) {
196
+ return delegate .isNewLine (bytes , p , end );
197
+ }
198
+ @ Override
199
+ public int mbcToCode (byte [] bytes , int p , int end ) {
200
+ return delegate .mbcToCode (bytes , p , end );
201
+ }
202
+ @ Override
203
+ public int codeToMbcLength (int code ) {
204
+ return delegate .codeToMbcLength (code );
205
+ }
206
+ @ Override
207
+ public int codeToMbc (int code , byte [] bytes , int p ) {
208
+ return delegate .codeToMbc (code , bytes , p );
209
+ }
210
+ @ Override
211
+ public int mbcCaseFold (int flag , byte [] bytes , IntHolder pp , int end , byte [] to ) {
212
+ return delegate .mbcCaseFold (flag , bytes , pp , end , to );
213
+ }
214
+ @ Override
215
+ public byte [] toLowerCaseTable () {
216
+ return delegate .toLowerCaseTable ();
217
+ }
218
+ @ Override
219
+ public void applyAllCaseFold (int flag , ApplyAllCaseFoldFunction fun , Object arg ) {
220
+ delegate .applyAllCaseFold (flag , fun , arg );
221
+ }
222
+ @ Override
223
+ public CaseFoldCodeItem [] caseFoldCodesByString (int flag , byte [] bytes , int p , int end ) {
224
+ return delegate .caseFoldCodesByString (flag , bytes , p , end );
225
+ }
226
+ @ Override
227
+ public int propertyNameToCType (byte [] bytes , int p , int end ) {
228
+ return delegate .propertyNameToCType (bytes , p , end );
229
+ }
230
+ @ Override
231
+ public boolean isCodeCType (int code , int ctype ) {
232
+ return delegate .isCodeCType (code , ctype );
233
+ }
234
+ @ Override
235
+ public int [] ctypeCodeRange (int ctype , IntHolder sbOut ) {
236
+ return delegate .ctypeCodeRange (ctype , sbOut );
237
+ }
238
+ @ Override
239
+ public int leftAdjustCharHead (byte [] bytes , int p , int s , int end ) {
240
+ return delegate .leftAdjustCharHead (bytes , p , s , end );
241
+ }
242
+ @ Override
243
+ public boolean isReverseMatchAllowed (byte [] bytes , int p , int end ) {
244
+ return delegate .isReverseMatchAllowed (bytes , p , end );
245
+ }
246
+ @ Override
247
+ public int caseMap (IntHolder flagP , byte [] bytes , IntHolder pp , int end , byte [] to , int toP , int toEnd ) {
248
+ return delegate .caseMap (flagP , bytes , pp , end , to , toP , toEnd );
249
+ }
250
+ @ Override
251
+ public int strLength (byte [] bytes , int p , int end ) {
252
+ return delegate .strLength (bytes , p , end );
253
+ }
254
+ @ Override
255
+ public int strCodeAt (byte [] bytes , int p , int end , int index ) {
256
+ return delegate .strCodeAt (bytes , p , end , index );
257
+ }
258
+ @ Override
259
+ public boolean isMbcCrnl (byte [] bytes , int p , int end ) {
260
+ return delegate .isMbcCrnl (bytes , p , end );
261
+ }
262
+ }
76
263
}
0 commit comments