Fix JoniRegularExpression compatibility issues with ECMA-262 (#1193)

justin-tay · web-flow · commit 2d9c327674b6 · 2025-09-13T10:28:17.000-04:00
diff --git a/pom.xml b/pom.xml
@@ -74,7 +74,7 @@
 
     <version.itu>1.14.0</version.itu>
     <version.jackson>2.18.3</version.jackson>
-    <version.joni>2.2.1</version.joni>
+    <version.joni>2.2.6</version.joni>
     <version.logback>1.3.14</version.logback> <!-- 1.4.x and above is not Java 8 compatible -->
     <version.slf4j>2.0.17</version.slf4j>
     <version.graaljs>21.3.10</version.graaljs> <!-- 22.x and above is not Java 8 compatible -->
diff --git a/src/main/java/com/networknt/schema/regex/JoniRegularExpression.java b/src/main/java/com/networknt/schema/regex/JoniRegularExpression.java
@@ -1,9 +1,17 @@
 package com.networknt.schema.regex;
 
+import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.util.regex.Pattern;
 
+import org.jcodings.ApplyAllCaseFoldFunction;
+import org.jcodings.CaseFoldCodeItem;
+import org.jcodings.CodeRange;
+import org.jcodings.Encoding;
+import org.jcodings.IntHolder;
+import org.jcodings.constants.CharacterType;
 import org.jcodings.specific.UTF8Encoding;
+import org.jcodings.unicode.UnicodeCodeRange;
 import org.joni.Option;
 import org.joni.Regex;
 import org.joni.Syntax;
@@ -42,17 +50,8 @@ class JoniRegularExpression implements RegularExpression {
 
     JoniRegularExpression(String regex, Syntax syntax) {
         validate(regex);
-        // Joni is too liberal on some constructs
-        String s = regex
-            .replace("\\d", "[0-9]")
-            .replace("\\D", "[^0-9]")
-            .replace("\\w", "[a-zA-Z0-9_]")
-            .replace("\\W", "[^a-zA-Z0-9_]")
-            .replace("\\s", "[ \\f\\n\\r\\t\\v\\u00a0\\u1680\\u2000-\\u200a\\u2028\\u2029\\u202f\\u205f\\u3000\\ufeff]")
-            .replace("\\S", "[^ \\f\\n\\r\\t\\v\\u00a0\\u1680\\u2000-\\u200a\\u2028\\u2029\\u202f\\u205f\\u3000\\ufeff]");
-
-        byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
-        this.pattern = new Regex(bytes, 0, bytes.length, Option.SINGLELINE, UTF8Encoding.INSTANCE, syntax);
+        byte[] bytes = regex.getBytes(StandardCharsets.UTF_8); // UTF-8 bytes, consistent with the UTF8Encoding-backed encoding used below
+        this.pattern = new Regex(bytes, 0, bytes.length, Option.SINGLELINE, ECMAScriptUTF8Encoding.INSTANCE, syntax);
     }
 
     protected void validate(String regex) {
@@ -73,4 +72,192 @@ public boolean matches(String value) {
         return this.pattern.matcher(bytes).search(0, bytes.length, Option.NONE) >= 0;
     }
 
+    static class Arrays {
+        /**
+         * Tests whether {@code a} equals the slice {@code a2[p, end)}.
+         * A slice that falls outside {@code a2} never matches, and a null
+         * array equals only another null array.
+         */
+        public static boolean equals(byte[] a, byte[] a2, int p, int end) {
+            if (a == null || a2 == null) {
+                return a == a2;
+            }
+            // No identity shortcut: the same array differs from its own sub-slice.
+            if (p < 0 || end > a2.length || end - p != a.length) {
+                return false;
+            }
+            for (int i = 0; i < a.length; i++) {
+                if (a[i] != a2[p + i]) {
+                    return false;
+                }
+            }
+            return true;
+        }
+    }
+
+    /**
+     * UTF-8 {@link Encoding} whose {@code \d}, {@code \w} and {@code \s} classes
+     * use the fixed code ranges of the ECMA-262 regular expression dialect
+     * rather than mapping them directly to a Unicode General Category.
+     */
+    public static class ECMAScriptUTF8Encoding extends DelegatingEncoding {
+        /** Code range table for {@code \d}: a count of one, then the pair ['0', '9']. */
+        private static final int[] CR_DIGIT = { 1, '0', '9' };
+        /** Code range table for {@code \w}: four pairs, [0-9], [A-Z], [_] and [a-z]. */
+        private static final int[] CR_WORD = { 4, '0', '9', 'A', 'Z', '_', '_', 'a', 'z' };
+        /**
+         * Code range table for {@code \s}: ten pairs covering tab through carriage
+         * return, space, no-break space, U+1680, U+2000-U+200A, U+2028-U+2029,
+         * U+202F, U+205F, U+3000 and U+FEFF.
+         */
+        private static final int[] CR_SPACE = { 10,
+                0x0009, 0x000d, 0x0020, 0x0020, 0x00a0, 0x00a0, 0x1680, 0x1680, 0x2000, 0x200a,
+                0x2028, 0x2029, 0x202f, 0x202f, 0x205f, 0x205f, 0x3000, 0x3000, 0xfeff, 0xfeff };
+        /** The bytes of the property name {@code digit}, as written in {@code \p{digit}}. */
+        private static final byte[] PROPERTY_NAME_DIGIT = { 'd', 'i', 'g', 'i', 't' };
+
+        public static final ECMAScriptUTF8Encoding INSTANCE = new ECMAScriptUTF8Encoding();
+
+        protected ECMAScriptUTF8Encoding() {
+            super(UTF8Encoding.INSTANCE);
+        }
+
+        /** Returns the overriding range table for {@code ctype}, or null when the delegate decides. */
+        private static int[] ecmaScriptRange(int ctype) {
+            switch (ctype) {
+            case CharacterType.DIGIT: // \d
+                return CR_DIGIT;
+            case CharacterType.WORD: // \w
+                return CR_WORD;
+            case CharacterType.SPACE: // \s
+                return CR_SPACE;
+            default:
+                return null;
+            }
+        }
+
+        @Override
+        public int[] ctypeCodeRange(int ctype, IntHolder sbOut) {
+            int[] range = ecmaScriptRange(ctype);
+            if (range == null) {
+                return delegate.ctypeCodeRange(ctype, sbOut);
+            }
+            sbOut.value = 0x80;
+            return range;
+        }
+
+        @Override
+        public boolean isCodeCType(int code, int ctype) {
+            int[] range = ecmaScriptRange(ctype);
+            if (range == null) {
+                return delegate.isCodeCType(code, ctype);
+            }
+            return CodeRange.isInCodeRange(range, code);
+        }
+
+        @Override
+        public int propertyNameToCType(byte[] name, int p, int end) {
+            boolean isDigitName = Arrays.equals(PROPERTY_NAME_DIGIT, name, p, end);
+            // \p{digit} is resolved like \p{Nd} instead of CharacterType.DIGIT, which is narrowed above.
+            // NOTE(review): the comparison is byte-exact, so only this spelling is intercepted - confirm.
+            return isDigitName ? UnicodeCodeRange.ND.ordinal() : delegate.propertyNameToCType(name, p, end);
+        }
+    }
+
+    /**
+     * An {@link Encoding} that forwards every overridden operation to a wrapped
+     * {@link Encoding}, and is constructed from that delegate's name, minimum
+     * length and maximum length. Subclass it to customize implementations that are final.
+     */
+    public static class DelegatingEncoding extends Encoding {
+        protected final Encoding delegate; // wrapped encoding that every override below forwards to
+        protected DelegatingEncoding(Encoding delegate) {
+            super(new String(delegate.getName()), delegate.minLength(), delegate.maxLength()); // NOTE(review): presumably decodes name bytes with the platform charset - confirm
+            this.delegate = delegate;
+        }
+        @Override
+        public Charset getCharset() {
+            return delegate.getCharset();
+        }
+        @Override
+        public String getCharsetName() {
+            return delegate.getCharsetName();
+        }
+        @Override
+        public int length(byte c) {
+            return delegate.length(c);
+        }
+        @Override
+        public int length(byte[] bytes, int p, int end) {
+            return delegate.length(bytes, p, end);
+        }
+        @Override
+        public boolean isNewLine(byte[] bytes, int p, int end) {
+            return delegate.isNewLine(bytes, p, end);
+        }
+        @Override
+        public int mbcToCode(byte[] bytes, int p, int end) {
+            return delegate.mbcToCode(bytes, p, end);
+        }
+        @Override
+        public int codeToMbcLength(int code) {
+            return delegate.codeToMbcLength(code);
+        }
+        @Override
+        public int codeToMbc(int code, byte[] bytes, int p) {
+            return delegate.codeToMbc(code, bytes, p);
+        }
+        @Override
+        public int mbcCaseFold(int flag, byte[] bytes, IntHolder pp, int end, byte[] to) {
+            return delegate.mbcCaseFold(flag, bytes, pp, end, to);
+        }
+        @Override
+        public byte[] toLowerCaseTable() {
+            return delegate.toLowerCaseTable();
+        }
+        @Override
+        public void applyAllCaseFold(int flag, ApplyAllCaseFoldFunction fun, Object arg) {
+            delegate.applyAllCaseFold(flag, fun, arg);
+        }
+        @Override
+        public CaseFoldCodeItem[] caseFoldCodesByString(int flag, byte[] bytes, int p, int end) {
+            return delegate.caseFoldCodesByString(flag, bytes, p, end);
+        }
+        @Override
+        public int propertyNameToCType(byte[] bytes, int p, int end) {
+            return delegate.propertyNameToCType(bytes, p, end);
+        }
+        @Override
+        public boolean isCodeCType(int code, int ctype) {
+            return delegate.isCodeCType(code, ctype);
+        }
+        @Override
+        public int[] ctypeCodeRange(int ctype, IntHolder sbOut) {
+            return delegate.ctypeCodeRange(ctype, sbOut);
+        }
+        @Override
+        public int leftAdjustCharHead(byte[] bytes, int p, int s, int end) {
+            return delegate.leftAdjustCharHead(bytes, p, s, end);
+        }
+        @Override
+        public boolean isReverseMatchAllowed(byte[] bytes, int p, int end) {
+            return delegate.isReverseMatchAllowed(bytes, p, end);
+        }
+        @Override
+        public int caseMap(IntHolder flagP, byte[] bytes, IntHolder pp, int end, byte[] to, int toP, int toEnd) {
+            return delegate.caseMap(flagP, bytes, pp, end, to, toP, toEnd);
+        }
+        @Override
+        public int strLength(byte[] bytes, int p, int end) {
+            return delegate.strLength(bytes, p, end);
+        }
+        @Override
+        public int strCodeAt(byte[] bytes, int p, int end, int index) {
+            return delegate.strCodeAt(bytes, p, end, index);
+        }
+        @Override
+        public boolean isMbcCrnl(byte[] bytes, int p, int end) {
+            return delegate.isMbcCrnl(bytes, p, end);
+        }
+    }
 }
diff --git a/src/test/java/com/networknt/schema/regex/JoniRegularExpressionTest.java b/src/test/java/com/networknt/schema/regex/JoniRegularExpressionTest.java
@@ -172,4 +172,82 @@ void noImplicitAnchors() {
         RegularExpression regex = new JoniRegularExpression("[a-z]{1,10}");
         assertTrue(regex.matches("1abc1"));
     }
+
+    @Test
+    void digitCharacterClassShouldNotMatchUnicodeDigit() {
+        // \d is limited to ASCII [0-9], so a digit outside that range must not match.
+        assertFalse(new JoniRegularExpression("\\d").matches("߀"));
+    }
+
+    @Test
+    void wordCharacterClassShouldNotMatchUnicodeDigit() {
+        // \w is limited to [a-zA-Z0-9_], so a digit outside ASCII must not match.
+        assertFalse(new JoniRegularExpression("\\w").matches("߀"));
+    }
+
+    @Test
+    void unicodeNumberCharacterClassShouldMatchUnicodeDigit() {
+        // Unicode property classes are not narrowed: \p{N} still matches a non-ASCII digit.
+        assertTrue(new JoniRegularExpression("\\p{N}").matches("߀"));
+    }
+
+    @Test
+    void unicodeNumberDigitCharacterClassShouldMatchUnicodeDigit() {
+        // \p{digit} is resolved like \p{Nd}, not like the ASCII-only \d.
+        assertTrue(new JoniRegularExpression("\\p{digit}").matches("߀"));
+    }
+
+    @Test
+    void unicodeNdCharacterClassShouldMatchUnicodeDigit() {
+        // \p{Nd} keeps matching decimal digits outside ASCII.
+        assertTrue(new JoniRegularExpression("\\p{Nd}").matches("߀"));
+    }
+
+    @Test
+    void digitCharacterClassShouldMatchAsciiDigit() {
+        // The ASCII digit 0 lies inside the \d range.
+        assertTrue(new JoniRegularExpression("\\d").matches("0"));
+    }
+
+    @Test
+    void digitCharacterClassShouldMatchAsciiDigitInCharacterSet() {
+        // \d keeps its meaning when used inside a bracket expression.
+        assertTrue(new JoniRegularExpression("[\\d]").matches("0"));
+    }
+
+    @Test
+    void whitespaceClassShouldMatchWhitespace() {
+        // A plain space is part of the \s set.
+        assertTrue(new JoniRegularExpression("\\s").matches(" "));
+    }
+
+    @Test
+    void whitespaceClassShouldMatchLatin1NonBreakingSpace() {
+        // U+00A0 (no-break space) is part of the \s set.
+        assertTrue(new JoniRegularExpression("\\s").matches("\u00a0"));
+    }
+
+    @Test
+    void whitespaceClassShouldMatchWhitespaceInCharacterSet() {
+        // \s keeps its meaning when used inside a bracket expression.
+        assertTrue(new JoniRegularExpression("[\\s]").matches(" "));
+    }
+
+    @Test
+    void whitespaceClassShouldMatchLatin1NonBreakingSpaceInCharacterSet() {
+        // U+00A0 is also matched by \s inside a bracket expression.
+        assertTrue(new JoniRegularExpression("[\\s]").matches("\u00a0"));
+    }
+
+    @Test
+    void nonWhitespaceClassShouldNotMatchWhitespaceInCharacterSet() {
+        // \S inside a bracket expression must reject a plain space.
+        assertFalse(new JoniRegularExpression("[\\S]").matches(" "));
+    }
+
+    @Test
+    void nonWhitespaceClassShouldNotMatchLatin1NonBreakingSpaceInCharacterSet() {
+        // \S inside a bracket expression must reject U+00A0 as well.
+        assertFalse(new JoniRegularExpression("[\\S]").matches("\u00a0"));
+    }
 }