Skip to content

Commit ec86098

Browse files
Change UnsafeProcessor.encodeUtf8(ByteBuffer out) to use the 'naive' strategy that we already used for the (byte[] out) case as a performance improvement.
PiperOrigin-RevId: 826012827
1 parent e830698 commit ec86098

File tree

1 file changed

+73
-136
lines changed
  • java/core/src/main/java/com/google/protobuf

1 file changed

+73
-136
lines changed

java/core/src/main/java/com/google/protobuf/Utf8.java

Lines changed: 73 additions & 136 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import static java.lang.Character.isSurrogatePair;
1919
import static java.lang.Character.toCodePoint;
2020

21+
import java.nio.BufferOverflowException;
2122
import java.nio.ByteBuffer;
2223
import java.util.Arrays;
2324

@@ -748,86 +749,13 @@ final void encodeUtf8(String in, ByteBuffer out) {
748749
final int offset = out.arrayOffset();
749750
int endIndex = Utf8.encode(in, out.array(), offset + out.position(), out.remaining());
750751
Java8Compatibility.position(out, endIndex - offset);
751-
} else if (out.isDirect()) {
752-
encodeUtf8Direct(in, out);
753752
} else {
754-
encodeUtf8Default(in, out);
753+
encodeUtf8Internal(in, out);
755754
}
756755
}
757756

758757
/** Encodes the input character sequence to a {@link ByteBuffer}, direct or not, that the array-based branch above did not handle. */
759-
abstract void encodeUtf8Direct(String in, ByteBuffer out);
760-
761-
/**
762-
* Encodes the input character sequence to a {@link ByteBuffer} instance using the {@link
763-
* ByteBuffer} API, rather than potentially faster approaches.
764-
*/
765-
final void encodeUtf8Default(String in, ByteBuffer out) {
766-
final int inLength = in.length();
767-
int outIx = out.position();
768-
int inIx = 0;
769-
770-
// Since ByteBuffer.putXXX() already checks boundaries for us, no need to explicitly check
771-
// access. Assume the buffer is big enough and let it handle the out of bounds exception
772-
// if it occurs.
773-
try {
774-
// Designed to take advantage of
775-
// https://wiki.openjdk.java.net/display/HotSpotInternals/RangeCheckElimination
776-
for (char c; inIx < inLength && (c = in.charAt(inIx)) < 0x80; ++inIx) {
777-
out.put(outIx + inIx, (byte) c);
778-
}
779-
if (inIx == inLength) {
780-
// Successfully encoded the entire string.
781-
Java8Compatibility.position(out, outIx + inIx);
782-
return;
783-
}
784-
785-
outIx += inIx;
786-
for (char c; inIx < inLength; ++inIx, ++outIx) {
787-
c = in.charAt(inIx);
788-
if (c < 0x80) {
789-
// One byte (0xxx xxxx)
790-
out.put(outIx, (byte) c);
791-
} else if (c < 0x800) {
792-
// Two bytes (110x xxxx 10xx xxxx)
793-
794-
// Benchmarks show put performs better than putShort here (for HotSpot).
795-
out.put(outIx++, (byte) (0xC0 | (c >>> 6)));
796-
out.put(outIx, (byte) (0x80 | (0x3F & c)));
797-
} else if (c < MIN_SURROGATE || MAX_SURROGATE < c) {
798-
// Three bytes (1110 xxxx 10xx xxxx 10xx xxxx)
799-
// Maximum single-char code point is 0xFFFF, 16 bits.
800-
801-
// Benchmarks show put performs better than putShort here (for HotSpot).
802-
out.put(outIx++, (byte) (0xE0 | (c >>> 12)));
803-
out.put(outIx++, (byte) (0x80 | (0x3F & (c >>> 6))));
804-
out.put(outIx, (byte) (0x80 | (0x3F & c)));
805-
} else {
806-
// Four bytes (1111 xxxx 10xx xxxx 10xx xxxx 10xx xxxx)
807-
808-
// Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8
809-
// bytes
810-
final char low;
811-
if (inIx + 1 == inLength || !isSurrogatePair(c, (low = in.charAt(++inIx)))) {
812-
throw new UnpairedSurrogateException(inIx, inLength);
813-
}
814-
// TODO: Consider using putInt() to improve performance.
815-
int codePoint = toCodePoint(c, low);
816-
out.put(outIx++, (byte) ((0xF << 4) | (codePoint >>> 18)));
817-
out.put(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12))));
818-
out.put(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6))));
819-
out.put(outIx, (byte) (0x80 | (0x3F & codePoint)));
820-
}
821-
}
822-
823-
// Successfully encoded the entire string.
824-
Java8Compatibility.position(out, outIx);
825-
} catch (IndexOutOfBoundsException e) {
826-
// TODO: Consider making the API throw IndexOutOfBoundsException instead.
827-
throw new ArrayIndexOutOfBoundsException(
828-
"Not enough space in output buffer to encode UTF-8 string");
829-
}
830-
}
758+
protected abstract void encodeUtf8Internal(String in, ByteBuffer out);
831759
}
832760

833761
/** {@link Processor} implementation that does not use any {@code sun.misc.Unsafe} methods. */
@@ -1062,9 +990,71 @@ int encodeUtf8(String in, byte[] out, int offset, int length) {
1062990
}
1063991

1064992
@Override
1065-
void encodeUtf8Direct(String in, ByteBuffer out) {
1066-
// For safe processing, we have to use the ByteBuffer API.
1067-
encodeUtf8Default(in, out);
993+
protected void encodeUtf8Internal(String in, ByteBuffer out) {
994+
final int inLength = in.length();
995+
int outIx = out.position();
996+
int inIx = 0;
997+
998+
// Since ByteBuffer.putXXX() already checks boundaries for us, no need to explicitly check
999+
// access. Assume the buffer is big enough and let it handle the out of bounds exception
1000+
// if it occurs.
1001+
try {
1002+
// Designed to take advantage of
1003+
// https://wiki.openjdk.java.net/display/HotSpotInternals/RangeCheckElimination
1004+
for (char c; inIx < inLength && (c = in.charAt(inIx)) < 0x80; ++inIx) {
1005+
out.put(outIx + inIx, (byte) c);
1006+
}
1007+
if (inIx == inLength) {
1008+
// Successfully encoded the entire string.
1009+
Java8Compatibility.position(out, outIx + inIx);
1010+
return;
1011+
}
1012+
1013+
outIx += inIx;
1014+
for (char c; inIx < inLength; ++inIx, ++outIx) {
1015+
c = in.charAt(inIx);
1016+
if (c < 0x80) {
1017+
// One byte (0xxx xxxx)
1018+
out.put(outIx, (byte) c);
1019+
} else if (c < 0x800) {
1020+
// Two bytes (110x xxxx 10xx xxxx)
1021+
1022+
// Benchmarks show put performs better than putShort here (for HotSpot).
1023+
out.put(outIx++, (byte) (0xC0 | (c >>> 6)));
1024+
out.put(outIx, (byte) (0x80 | (0x3F & c)));
1025+
} else if (c < MIN_SURROGATE || MAX_SURROGATE < c) {
1026+
// Three bytes (1110 xxxx 10xx xxxx 10xx xxxx)
1027+
// Maximum single-char code point is 0xFFFF, 16 bits.
1028+
1029+
// Benchmarks show put performs better than putShort here (for HotSpot).
1030+
out.put(outIx++, (byte) (0xE0 | (c >>> 12)));
1031+
out.put(outIx++, (byte) (0x80 | (0x3F & (c >>> 6))));
1032+
out.put(outIx, (byte) (0x80 | (0x3F & c)));
1033+
} else {
1034+
// Four bytes (1111 xxxx 10xx xxxx 10xx xxxx 10xx xxxx)
1035+
1036+
// Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8
1037+
// bytes
1038+
final char low;
1039+
if (inIx + 1 == inLength || !isSurrogatePair(c, (low = in.charAt(++inIx)))) {
1040+
throw new UnpairedSurrogateException(inIx, inLength);
1041+
}
1042+
// TODO: Consider using putInt() to improve performance.
1043+
int codePoint = toCodePoint(c, low);
1044+
out.put(outIx++, (byte) ((0xF << 4) | (codePoint >>> 18)));
1045+
out.put(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12))));
1046+
out.put(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6))));
1047+
out.put(outIx, (byte) (0x80 | (0x3F & codePoint)));
1048+
}
1049+
}
1050+
1051+
// Successfully encoded the entire string.
1052+
Java8Compatibility.position(out, outIx);
1053+
} catch (IndexOutOfBoundsException unused) {
1054+
// TODO: Consider making the API throw IndexOutOfBoundsException instead.
1055+
throw new ArrayIndexOutOfBoundsException(
1056+
"Not enough space in output buffer to encode UTF-8 string");
1057+
}
10681058
}
10691059

10701060
private static int partialIsValidUtf8(byte[] bytes, int index, int limit) {
@@ -1450,67 +1440,14 @@ int encodeUtf8(final String in, final byte[] out, final int offset, final int le
14501440
}
14511441

14521442
@Override
1453-
void encodeUtf8Direct(String in, ByteBuffer out) {
1454-
final long address = addressOffset(out);
1455-
long outIx = address + out.position();
1456-
final long outLimit = address + out.limit();
1457-
final int inLimit = in.length();
1458-
if (inLimit > outLimit - outIx) {
1459-
// Not even enough room for an ASCII-encoded string.
1443+
protected void encodeUtf8Internal(String in, ByteBuffer out) {
1444+
byte[] bytes = in.getBytes(Internal.UTF_8);
1445+
try {
1446+
out.put(bytes);
1447+
} catch (BufferOverflowException unused) {
14601448
throw new ArrayIndexOutOfBoundsException(
14611449
"Not enough space in output buffer to encode UTF-8 string");
14621450
}
1463-
1464-
// Designed to take advantage of
1465-
// https://wiki.openjdk.java.net/display/HotSpotInternals/RangeCheckElimination
1466-
int inIx = 0;
1467-
for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) {
1468-
UnsafeUtil.putByte(outIx++, (byte) c);
1469-
}
1470-
if (inIx == inLimit) {
1471-
// We're done, it was ASCII encoded.
1472-
Java8Compatibility.position(out, (int) (outIx - address));
1473-
return;
1474-
}
1475-
1476-
for (char c; inIx < inLimit; ++inIx) {
1477-
c = in.charAt(inIx);
1478-
if (c < 0x80 && outIx < outLimit) {
1479-
UnsafeUtil.putByte(outIx++, (byte) c);
1480-
} else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8 bytes
1481-
UnsafeUtil.putByte(outIx++, (byte) ((0xF << 6) | (c >>> 6)));
1482-
UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & c)));
1483-
} else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit - 3L) {
1484-
// Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes
1485-
UnsafeUtil.putByte(outIx++, (byte) ((0xF << 5) | (c >>> 12)));
1486-
UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (c >>> 6))));
1487-
UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & c)));
1488-
} else if (outIx <= outLimit - 4L) {
1489-
// Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8
1490-
// bytes
1491-
final char low;
1492-
if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx)))) {
1493-
throw new UnpairedSurrogateException((inIx - 1), inLimit);
1494-
}
1495-
int codePoint = toCodePoint(c, low);
1496-
UnsafeUtil.putByte(outIx++, (byte) ((0xF << 4) | (codePoint >>> 18)));
1497-
UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12))));
1498-
UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6))));
1499-
UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & codePoint)));
1500-
} else {
1501-
if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE)
1502-
&& (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1)))) {
1503-
// We are surrogates and we're not a surrogate pair.
1504-
throw new UnpairedSurrogateException(inIx, inLimit);
1505-
}
1506-
// Not enough space in the output buffer.
1507-
throw new ArrayIndexOutOfBoundsException(
1508-
"Not enough space in output buffer to encode UTF-8 string");
1509-
}
1510-
}
1511-
1512-
// All bytes have been encoded.
1513-
Java8Compatibility.position(out, (int) (outIx - address));
15141451
}
15151452

15161453
/**

0 commit comments

Comments
 (0)