|
18 | 18 | import static java.lang.Character.isSurrogatePair; |
19 | 19 | import static java.lang.Character.toCodePoint; |
20 | 20 |
|
| 21 | +import java.nio.BufferOverflowException; |
21 | 22 | import java.nio.ByteBuffer; |
22 | 23 | import java.util.Arrays; |
23 | 24 |
|
@@ -748,86 +749,13 @@ final void encodeUtf8(String in, ByteBuffer out) { |
748 | 749 | final int offset = out.arrayOffset(); |
749 | 750 | int endIndex = Utf8.encode(in, out.array(), offset + out.position(), out.remaining()); |
750 | 751 | Java8Compatibility.position(out, endIndex - offset); |
751 | | - } else if (out.isDirect()) { |
752 | | - encodeUtf8Direct(in, out); |
753 | 752 | } else { |
754 | | - encodeUtf8Default(in, out); |
| 753 | + encodeUtf8Internal(in, out); |
755 | 754 | } |
756 | 755 | } |
757 | 756 |
|
758 | 757 | /** Encodes the input character sequence to a {@link ByteBuffer} instance, direct or non-direct, that the preceding branch of {@code encodeUtf8} did not handle. */ |
759 | | - abstract void encodeUtf8Direct(String in, ByteBuffer out); |
760 | | - |
761 | | - /** |
762 | | - * Encodes the input character sequence to a {@link ByteBuffer} instance using the {@link |
763 | | - * ByteBuffer} API, rather than potentially faster approaches. |
764 | | - */ |
765 | | - final void encodeUtf8Default(String in, ByteBuffer out) { |
766 | | - final int inLength = in.length(); |
767 | | - int outIx = out.position(); |
768 | | - int inIx = 0; |
769 | | - |
770 | | - // Since ByteBuffer.putXXX() already checks boundaries for us, no need to explicitly check |
771 | | - // access. Assume the buffer is big enough and let it handle the out of bounds exception |
772 | | - // if it occurs. |
773 | | - try { |
774 | | - // Designed to take advantage of |
775 | | - // https://wiki.openjdk.java.net/display/HotSpotInternals/RangeCheckElimination |
776 | | - for (char c; inIx < inLength && (c = in.charAt(inIx)) < 0x80; ++inIx) { |
777 | | - out.put(outIx + inIx, (byte) c); |
778 | | - } |
779 | | - if (inIx == inLength) { |
780 | | - // Successfully encoded the entire string. |
781 | | - Java8Compatibility.position(out, outIx + inIx); |
782 | | - return; |
783 | | - } |
784 | | - |
785 | | - outIx += inIx; |
786 | | - for (char c; inIx < inLength; ++inIx, ++outIx) { |
787 | | - c = in.charAt(inIx); |
788 | | - if (c < 0x80) { |
789 | | - // One byte (0xxx xxxx) |
790 | | - out.put(outIx, (byte) c); |
791 | | - } else if (c < 0x800) { |
792 | | - // Two bytes (110x xxxx 10xx xxxx) |
793 | | - |
794 | | - // Benchmarks show put performs better than putShort here (for HotSpot). |
795 | | - out.put(outIx++, (byte) (0xC0 | (c >>> 6))); |
796 | | - out.put(outIx, (byte) (0x80 | (0x3F & c))); |
797 | | - } else if (c < MIN_SURROGATE || MAX_SURROGATE < c) { |
798 | | - // Three bytes (1110 xxxx 10xx xxxx 10xx xxxx) |
799 | | - // Maximum single-char code point is 0xFFFF, 16 bits. |
800 | | - |
801 | | - // Benchmarks show put performs better than putShort here (for HotSpot). |
802 | | - out.put(outIx++, (byte) (0xE0 | (c >>> 12))); |
803 | | - out.put(outIx++, (byte) (0x80 | (0x3F & (c >>> 6)))); |
804 | | - out.put(outIx, (byte) (0x80 | (0x3F & c))); |
805 | | - } else { |
806 | | - // Four bytes (1111 xxxx 10xx xxxx 10xx xxxx 10xx xxxx) |
807 | | - |
808 | | - // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8 |
809 | | - // bytes |
810 | | - final char low; |
811 | | - if (inIx + 1 == inLength || !isSurrogatePair(c, (low = in.charAt(++inIx)))) { |
812 | | - throw new UnpairedSurrogateException(inIx, inLength); |
813 | | - } |
814 | | - // TODO: Consider using putInt() to improve performance. |
815 | | - int codePoint = toCodePoint(c, low); |
816 | | - out.put(outIx++, (byte) ((0xF << 4) | (codePoint >>> 18))); |
817 | | - out.put(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12)))); |
818 | | - out.put(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6)))); |
819 | | - out.put(outIx, (byte) (0x80 | (0x3F & codePoint))); |
820 | | - } |
821 | | - } |
822 | | - |
823 | | - // Successfully encoded the entire string. |
824 | | - Java8Compatibility.position(out, outIx); |
825 | | - } catch (IndexOutOfBoundsException e) { |
826 | | - // TODO: Consider making the API throw IndexOutOfBoundsException instead. |
827 | | - throw new ArrayIndexOutOfBoundsException( |
828 | | - "Not enough space in output buffer to encode UTF-8 string"); |
829 | | - } |
830 | | - } |
| 758 | + protected abstract void encodeUtf8Internal(String in, ByteBuffer out); |
831 | 759 | } |
832 | 760 |
|
833 | 761 | /** {@link Processor} implementation that does not use any {@code sun.misc.Unsafe} methods. */ |
@@ -1062,9 +990,71 @@ int encodeUtf8(String in, byte[] out, int offset, int length) { |
1062 | 990 | } |
1063 | 991 |
|
1064 | 992 | @Override |
1065 | | - void encodeUtf8Direct(String in, ByteBuffer out) { |
1066 | | - // For safe processing, we have to use the ByteBuffer API. |
1067 | | - encodeUtf8Default(in, out); |
| 993 | + protected void encodeUtf8Internal(String in, ByteBuffer out) { |
| 994 | + final int inLength = in.length(); |
| 995 | + int outIx = out.position(); |
| 996 | + int inIx = 0; |
| 997 | + |
| 998 | + // Since ByteBuffer.putXXX() already checks boundaries for us, no need to explicitly check |
| 999 | + // access. Assume the buffer is big enough and let it handle the out of bounds exception |
| 1000 | + // if it occurs. |
| 1001 | + try { |
| 1002 | + // Designed to take advantage of |
| 1003 | + // https://wiki.openjdk.java.net/display/HotSpotInternals/RangeCheckElimination |
| 1004 | + for (char c; inIx < inLength && (c = in.charAt(inIx)) < 0x80; ++inIx) { |
| 1005 | + out.put(outIx + inIx, (byte) c); |
| 1006 | + } |
| 1007 | + if (inIx == inLength) { |
| 1008 | + // Successfully encoded the entire string. |
| 1009 | + Java8Compatibility.position(out, outIx + inIx); |
| 1010 | + return; |
| 1011 | + } |
| 1012 | + |
| 1013 | + outIx += inIx; |
| 1014 | + for (char c; inIx < inLength; ++inIx, ++outIx) { |
| 1015 | + c = in.charAt(inIx); |
| 1016 | + if (c < 0x80) { |
| 1017 | + // One byte (0xxx xxxx) |
| 1018 | + out.put(outIx, (byte) c); |
| 1019 | + } else if (c < 0x800) { |
| 1020 | + // Two bytes (110x xxxx 10xx xxxx) |
| 1021 | + |
| 1022 | + // Benchmarks show put performs better than putShort here (for HotSpot). |
| 1023 | + out.put(outIx++, (byte) (0xC0 | (c >>> 6))); |
| 1024 | + out.put(outIx, (byte) (0x80 | (0x3F & c))); |
| 1025 | + } else if (c < MIN_SURROGATE || MAX_SURROGATE < c) { |
| 1026 | + // Three bytes (1110 xxxx 10xx xxxx 10xx xxxx) |
| 1027 | + // Maximum single-char code point is 0xFFFF, 16 bits. |
| 1028 | + |
| 1029 | + // Benchmarks show put performs better than putShort here (for HotSpot). |
| 1030 | + out.put(outIx++, (byte) (0xE0 | (c >>> 12))); |
| 1031 | + out.put(outIx++, (byte) (0x80 | (0x3F & (c >>> 6)))); |
| 1032 | + out.put(outIx, (byte) (0x80 | (0x3F & c))); |
| 1033 | + } else { |
| 1034 | + // Four bytes (1111 xxxx 10xx xxxx 10xx xxxx 10xx xxxx) |
| 1035 | + |
| 1036 | + // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8 |
| 1037 | + // bytes |
| 1038 | + final char low; |
| 1039 | + if (inIx + 1 == inLength || !isSurrogatePair(c, (low = in.charAt(++inIx)))) { |
| 1040 | + throw new UnpairedSurrogateException(inIx, inLength); |
| 1041 | + } |
| 1042 | + // TODO: Consider using putInt() to improve performance. |
| 1043 | + int codePoint = toCodePoint(c, low); |
| 1044 | + out.put(outIx++, (byte) ((0xF << 4) | (codePoint >>> 18))); |
| 1045 | + out.put(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12)))); |
| 1046 | + out.put(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6)))); |
| 1047 | + out.put(outIx, (byte) (0x80 | (0x3F & codePoint))); |
| 1048 | + } |
| 1049 | + } |
| 1050 | + |
| 1051 | + // Successfully encoded the entire string. |
| 1052 | + Java8Compatibility.position(out, outIx); |
| 1053 | + } catch (IndexOutOfBoundsException unused) { |
| 1054 | + // TODO: Consider making the API throw IndexOutOfBoundsException instead. |
| 1055 | + throw new ArrayIndexOutOfBoundsException( |
| 1056 | + "Not enough space in output buffer to encode UTF-8 string"); |
| 1057 | + } |
1068 | 1058 | } |
1069 | 1059 |
|
1070 | 1060 | private static int partialIsValidUtf8(byte[] bytes, int index, int limit) { |
@@ -1450,67 +1440,14 @@ int encodeUtf8(final String in, final byte[] out, final int offset, final int le |
1450 | 1440 | } |
1451 | 1441 |
|
1452 | 1442 | @Override |
1453 | | - void encodeUtf8Direct(String in, ByteBuffer out) { |
1454 | | - final long address = addressOffset(out); |
1455 | | - long outIx = address + out.position(); |
1456 | | - final long outLimit = address + out.limit(); |
1457 | | - final int inLimit = in.length(); |
1458 | | - if (inLimit > outLimit - outIx) { |
1459 | | - // Not even enough room for an ASCII-encoded string. |
| 1443 | + protected void encodeUtf8Internal(String in, ByteBuffer out) { |
| 1444 | + byte[] bytes = in.getBytes(Internal.UTF_8); |
| 1445 | + try { |
| 1446 | + out.put(bytes); |
| 1447 | + } catch (BufferOverflowException unused) { |
1460 | 1448 | throw new ArrayIndexOutOfBoundsException( |
1461 | 1449 | "Not enough space in output buffer to encode UTF-8 string"); |
1462 | 1450 | } |
1463 | | - |
1464 | | - // Designed to take advantage of |
1465 | | - // https://wiki.openjdk.java.net/display/HotSpotInternals/RangeCheckElimination |
1466 | | - int inIx = 0; |
1467 | | - for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) { |
1468 | | - UnsafeUtil.putByte(outIx++, (byte) c); |
1469 | | - } |
1470 | | - if (inIx == inLimit) { |
1471 | | - // We're done, it was ASCII encoded. |
1472 | | - Java8Compatibility.position(out, (int) (outIx - address)); |
1473 | | - return; |
1474 | | - } |
1475 | | - |
1476 | | - for (char c; inIx < inLimit; ++inIx) { |
1477 | | - c = in.charAt(inIx); |
1478 | | - if (c < 0x80 && outIx < outLimit) { |
1479 | | - UnsafeUtil.putByte(outIx++, (byte) c); |
1480 | | - } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8 bytes |
1481 | | - UnsafeUtil.putByte(outIx++, (byte) ((0xF << 6) | (c >>> 6))); |
1482 | | - UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & c))); |
1483 | | - } else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit - 3L) { |
1484 | | - // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes |
1485 | | - UnsafeUtil.putByte(outIx++, (byte) ((0xF << 5) | (c >>> 12))); |
1486 | | - UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (c >>> 6)))); |
1487 | | - UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & c))); |
1488 | | - } else if (outIx <= outLimit - 4L) { |
1489 | | - // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8 |
1490 | | - // bytes |
1491 | | - final char low; |
1492 | | - if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx)))) { |
1493 | | - throw new UnpairedSurrogateException((inIx - 1), inLimit); |
1494 | | - } |
1495 | | - int codePoint = toCodePoint(c, low); |
1496 | | - UnsafeUtil.putByte(outIx++, (byte) ((0xF << 4) | (codePoint >>> 18))); |
1497 | | - UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12)))); |
1498 | | - UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6)))); |
1499 | | - UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & codePoint))); |
1500 | | - } else { |
1501 | | - if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE) |
1502 | | - && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1)))) { |
1503 | | - // We are surrogates and we're not a surrogate pair. |
1504 | | - throw new UnpairedSurrogateException(inIx, inLimit); |
1505 | | - } |
1506 | | - // Not enough space in the output buffer. |
1507 | | - throw new ArrayIndexOutOfBoundsException( |
1508 | | - "Not enough space in output buffer to encode UTF-8 string"); |
1509 | | - } |
1510 | | - } |
1511 | | - |
1512 | | - // All bytes have been encoded. |
1513 | | - Java8Compatibility.position(out, (int) (outIx - address)); |
1514 | 1451 | } |
1515 | 1452 |
|
1516 | 1453 | /** |
|
0 commit comments