Skip to content

Commit 2d9c327

Browse files
authored
Fix JoniRegularExpression compatibility issues with ECMA-262 (#1193)
1 parent 5c85cd5 commit 2d9c327

File tree

3 files changed

+277
-12
lines changed

3 files changed

+277
-12
lines changed

pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@
7474

7575
<version.itu>1.14.0</version.itu>
7676
<version.jackson>2.18.3</version.jackson>
77-
<version.joni>2.2.1</version.joni>
77+
<version.joni>2.2.6</version.joni>
7878
<version.logback>1.3.14</version.logback> <!-- 1.4.x and above is not Java 8 compatible -->
7979
<version.slf4j>2.0.17</version.slf4j>
8080
<version.graaljs>21.3.10</version.graaljs> <!-- 22.x and above is not Java 8 compatible -->

src/main/java/com/networknt/schema/regex/JoniRegularExpression.java

Lines changed: 198 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,17 @@
11
package com.networknt.schema.regex;
22

3+
import java.nio.charset.Charset;
34
import java.nio.charset.StandardCharsets;
45
import java.util.regex.Pattern;
56

7+
import org.jcodings.ApplyAllCaseFoldFunction;
8+
import org.jcodings.CaseFoldCodeItem;
9+
import org.jcodings.CodeRange;
10+
import org.jcodings.Encoding;
11+
import org.jcodings.IntHolder;
12+
import org.jcodings.constants.CharacterType;
613
import org.jcodings.specific.UTF8Encoding;
14+
import org.jcodings.unicode.UnicodeCodeRange;
715
import org.joni.Option;
816
import org.joni.Regex;
917
import org.joni.Syntax;
@@ -42,17 +50,8 @@ class JoniRegularExpression implements RegularExpression {
4250

4351
JoniRegularExpression(String regex, Syntax syntax) {
4452
validate(regex);
45-
// Joni is too liberal on some constructs
46-
String s = regex
47-
.replace("\\d", "[0-9]")
48-
.replace("\\D", "[^0-9]")
49-
.replace("\\w", "[a-zA-Z0-9_]")
50-
.replace("\\W", "[^a-zA-Z0-9_]")
51-
.replace("\\s", "[ \\f\\n\\r\\t\\v\\u00a0\\u1680\\u2000-\\u200a\\u2028\\u2029\\u202f\\u205f\\u3000\\ufeff]")
52-
.replace("\\S", "[^ \\f\\n\\r\\t\\v\\u00a0\\u1680\\u2000-\\u200a\\u2028\\u2029\\u202f\\u205f\\u3000\\ufeff]");
53-
54-
byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
55-
this.pattern = new Regex(bytes, 0, bytes.length, Option.SINGLELINE, UTF8Encoding.INSTANCE, syntax);
53+
byte[] bytes = regex.getBytes(StandardCharsets.UTF_8);
54+
this.pattern = new Regex(bytes, 0, bytes.length, Option.SINGLELINE, ECMAScriptUTF8Encoding.INSTANCE, syntax);
5655
}
5756

5857
protected void validate(String regex) {
@@ -73,4 +72,192 @@ public boolean matches(String value) {
7372
return this.pattern.matcher(bytes).search(0, bytes.length, Option.NONE) >= 0;
7473
}
7574

75+
static class Arrays {
76+
public static boolean equals(byte[] a, byte[] a2, int p, int end) {
77+
if (a==a2) {
78+
return true;
79+
}
80+
if (a==null || a2==null) {
81+
return false;
82+
}
83+
84+
int length = a.length;
85+
if ((end - p) != length) {
86+
return false;
87+
}
88+
89+
for (int i=0; i<length; i++) {
90+
if (a[i] != a2[i+p]) {
91+
return false;
92+
}
93+
}
94+
return true;
95+
}
96+
}
97+
98+
/**
99+
* An {@link Encoding} that returns the appropriate code ranges that correspond
100+
* to the ECMA-262 regular expression implementation instead of matching
101+
* directly to a Unicode General Category.
102+
*/
103+
public static class ECMAScriptUTF8Encoding extends DelegatingEncoding {
104+
/*
105+
* [0-9]
106+
*/
107+
private static final int[] CR_DIGIT = { 1, '0', '9' };
108+
/*
109+
* [a-zA-Z0-9_]
110+
*/
111+
private static final int[] CR_WORD = { 4, '0', '9', 'A', 'Z', '_', '_', 'a', 'z' };
112+
/*
113+
* [\f\n\r\t\v\u0020\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff]
114+
*/
115+
private static final int[] CR_SPACE = { 10, '\t', '\r', ' ', ' ', '\u00a0', '\u00a0', '\u1680', '\u1680', '\u2000',
116+
'\u200a', '\u2028', '\u2029', '\u202f', '\u202f', '\u205f', '\u205f', '\u3000', '\u3000', '\ufeff',
117+
'\ufeff' };
118+
/*
119+
* For \p{digit}
120+
*/
121+
private static final byte[] PROPERTY_NAME_DIGIT = { 100, 105, 103, 105, 116};
122+
123+
public static final ECMAScriptUTF8Encoding INSTANCE = new ECMAScriptUTF8Encoding();
124+
125+
protected ECMAScriptUTF8Encoding() {
126+
super(UTF8Encoding.INSTANCE);
127+
}
128+
129+
@Override
130+
public int[] ctypeCodeRange(int ctype, IntHolder sbOut) {
131+
switch (ctype) {
132+
case CharacterType.DIGIT: // \d
133+
sbOut.value = 0x80;
134+
return CR_DIGIT;
135+
case CharacterType.WORD: // \w
136+
sbOut.value = 0x80;
137+
return CR_WORD;
138+
case CharacterType.SPACE: // \s
139+
sbOut.value = 0x80;
140+
return CR_SPACE;
141+
}
142+
return delegate.ctypeCodeRange(ctype, sbOut);
143+
}
144+
145+
@Override
146+
public boolean isCodeCType(int code, int ctype) {
147+
switch (ctype) {
148+
case CharacterType.DIGIT: // \d
149+
return CodeRange.isInCodeRange(CR_DIGIT, code);
150+
case CharacterType.WORD: // \w
151+
return CodeRange.isInCodeRange(CR_WORD, code);
152+
case CharacterType.SPACE: // \s
153+
return CodeRange.isInCodeRange(CR_SPACE, code);
154+
}
155+
return delegate.isCodeCType(code, ctype);
156+
}
157+
158+
@Override
159+
public int propertyNameToCType(byte[]name, int p, int end) {
160+
if (Arrays.equals(PROPERTY_NAME_DIGIT, name, p, end)) {
161+
return UnicodeCodeRange.ND.ordinal();// 55 Same as \p{Nd} and not returning CharacterType.DIGIT
162+
}
163+
return delegate.propertyNameToCType(name, p, end);
164+
}
165+
}
166+
167+
/**
168+
* An {@link Encoding} that delegates to another {@link Encoding}.
169+
* <p>
170+
* This can be used to customize the behavior of implementations that are final.
171+
*/
172+
public static class DelegatingEncoding extends Encoding {
173+
protected final Encoding delegate;
174+
protected DelegatingEncoding(Encoding delegate) {
175+
super(new String(delegate.getName()), delegate.minLength(), delegate.maxLength());
176+
this.delegate = delegate;
177+
}
178+
@Override
179+
public Charset getCharset() {
180+
return delegate.getCharset();
181+
}
182+
@Override
183+
public String getCharsetName() {
184+
return delegate.getCharsetName();
185+
}
186+
@Override
187+
public int length(byte c) {
188+
return delegate.length(c);
189+
}
190+
@Override
191+
public int length(byte[] bytes, int p, int end) {
192+
return delegate.length(bytes, p, end);
193+
}
194+
@Override
195+
public boolean isNewLine(byte[] bytes, int p, int end) {
196+
return delegate.isNewLine(bytes, p, end);
197+
}
198+
@Override
199+
public int mbcToCode(byte[] bytes, int p, int end) {
200+
return delegate.mbcToCode(bytes, p, end);
201+
}
202+
@Override
203+
public int codeToMbcLength(int code) {
204+
return delegate.codeToMbcLength(code);
205+
}
206+
@Override
207+
public int codeToMbc(int code, byte[] bytes, int p) {
208+
return delegate.codeToMbc(code, bytes, p);
209+
}
210+
@Override
211+
public int mbcCaseFold(int flag, byte[] bytes, IntHolder pp, int end, byte[] to) {
212+
return delegate.mbcCaseFold(flag, bytes, pp, end, to);
213+
}
214+
@Override
215+
public byte[] toLowerCaseTable() {
216+
return delegate.toLowerCaseTable();
217+
}
218+
@Override
219+
public void applyAllCaseFold(int flag, ApplyAllCaseFoldFunction fun, Object arg) {
220+
delegate.applyAllCaseFold(flag, fun, arg);
221+
}
222+
@Override
223+
public CaseFoldCodeItem[] caseFoldCodesByString(int flag, byte[] bytes, int p, int end) {
224+
return delegate.caseFoldCodesByString(flag, bytes, p, end);
225+
}
226+
@Override
227+
public int propertyNameToCType(byte[] bytes, int p, int end) {
228+
return delegate.propertyNameToCType(bytes, p, end);
229+
}
230+
@Override
231+
public boolean isCodeCType(int code, int ctype) {
232+
return delegate.isCodeCType(code, ctype);
233+
}
234+
@Override
235+
public int[] ctypeCodeRange(int ctype, IntHolder sbOut) {
236+
return delegate.ctypeCodeRange(ctype, sbOut);
237+
}
238+
@Override
239+
public int leftAdjustCharHead(byte[] bytes, int p, int s, int end) {
240+
return delegate.leftAdjustCharHead(bytes, p, s, end);
241+
}
242+
@Override
243+
public boolean isReverseMatchAllowed(byte[] bytes, int p, int end) {
244+
return delegate.isReverseMatchAllowed(bytes, p, end);
245+
}
246+
@Override
247+
public int caseMap(IntHolder flagP, byte[] bytes, IntHolder pp, int end, byte[] to, int toP, int toEnd) {
248+
return delegate.caseMap(flagP, bytes, pp, end, to, toP, toEnd);
249+
}
250+
@Override
251+
public int strLength(byte[] bytes, int p, int end) {
252+
return delegate.strLength(bytes, p, end);
253+
}
254+
@Override
255+
public int strCodeAt(byte[] bytes, int p, int end, int index) {
256+
return delegate.strCodeAt(bytes, p, end, index);
257+
}
258+
@Override
259+
public boolean isMbcCrnl(byte[] bytes, int p, int end) {
260+
return delegate.isMbcCrnl(bytes, p, end);
261+
}
262+
}
76263
}

src/test/java/com/networknt/schema/regex/JoniRegularExpressionTest.java

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,4 +172,82 @@ void noImplicitAnchors() {
172172
RegularExpression regex = new JoniRegularExpression("[a-z]{1,10}");
173173
assertTrue(regex.matches("1abc1"));
174174
}
175+
176+
@Test
177+
void digitCharacterClassShouldNotMatchUnicodeDigit() {
178+
RegularExpression regex = new JoniRegularExpression("\\d");
179+
assertFalse(regex.matches("߀"));
180+
}
181+
182+
@Test
183+
void wordCharacterClassShouldNotMatchUnicodeDigit() {
184+
RegularExpression regex = new JoniRegularExpression("\\w");
185+
assertFalse(regex.matches("߀"));
186+
}
187+
188+
@Test
189+
void unicodeNumberCharacterClassShouldMatchUnicodeDigit() {
190+
RegularExpression regex = new JoniRegularExpression("\\p{N}");
191+
assertTrue(regex.matches("߀"));
192+
}
193+
194+
@Test
195+
void unicodeNumberDigitCharacterClassShouldMatchUnicodeDigit() {
196+
RegularExpression regex = new JoniRegularExpression("\\p{digit}");
197+
assertTrue(regex.matches("߀"));
198+
}
199+
200+
@Test
201+
void unicodeNdCharacterClassShouldMatchUnicodeDigit() {
202+
RegularExpression regex = new JoniRegularExpression("\\p{Nd}");
203+
assertTrue(regex.matches("߀"));
204+
}
205+
206+
@Test
207+
void digitCharacterClassShouldMatchAsciiDigit() {
208+
RegularExpression regex = new JoniRegularExpression("\\d");
209+
assertTrue(regex.matches("0"));
210+
}
211+
212+
@Test
213+
void digitCharacterClassShouldMatchAsciiDigitInCharacterSet() {
214+
RegularExpression regex = new JoniRegularExpression("[\\d]");
215+
assertTrue(regex.matches("0"));
216+
}
217+
218+
@Test
219+
void whitespaceClassShouldMatchWhitespace() {
220+
RegularExpression regex = new JoniRegularExpression("\\s");
221+
assertTrue(regex.matches(" "));
222+
}
223+
224+
@Test
225+
void whitespaceClassShouldMatchLatin1NonBreakingSpace() {
226+
RegularExpression regex = new JoniRegularExpression("\\s");
227+
assertTrue(regex.matches("\u00a0"));
228+
}
229+
230+
@Test
231+
void whitespaceClassShouldMatchWhitespaceInCharacterSet() {
232+
RegularExpression regex = new JoniRegularExpression("[\\s]");
233+
assertTrue(regex.matches(" "));
234+
}
235+
236+
@Test
237+
void whitespaceClassShouldMatchLatin1NonBreakingSpaceInCharacterSet() {
238+
RegularExpression regex = new JoniRegularExpression("[\\s]");
239+
assertTrue(regex.matches("\u00a0"));
240+
}
241+
242+
@Test
243+
void nonWhitespaceClassShouldNotMatchWhitespaceInCharacterSet() {
244+
RegularExpression regex = new JoniRegularExpression("[\\S]");
245+
assertFalse(regex.matches(" "));
246+
}
247+
248+
@Test
249+
void nonWhitespaceClassShouldNotMatchLatin1NonBreakingSpaceInCharacterSet() {
250+
RegularExpression regex = new JoniRegularExpression("[\\S]");
251+
assertFalse(regex.matches("\u00a0"));
252+
}
175253
}

0 commit comments

Comments
 (0)