# Patch summary (explanatory preamble; not part of either hunk).
#
# Files touched:
#   ycmd/completers/language_server/language_server_protocol.py
#   ycmd/tests/language_server/language_server_protocol_test.py
#
# Hunk 1 (inside UTF16CodeUnitsToCodepoints, per the hunk header):
#   Before the final slice-and-decode, when byte_offset_utf16 is still inside
#   value_as_utf16_bytes, the byte at byte_offset_utf16 + 1 is inspected. If it
#   lies in 0xdc..0xdf, code_unit_offset is bumped by one so that the slice
#   value_as_utf16_bytes[ : code_unit_offset * 2 ] ends after the surrogate
#   pair instead of in the middle of it, and the 'utf-16-le' decode then counts
#   the whole character.
#   0xDC..0xDF is the high-order byte of a UTF-16 low (trailing) surrogate
#   (U+DC00..U+DFFF); in little-endian order it is the second byte of the code
#   unit, hence the "+ 1".
#   NOTE(review): value_as_utf16_bytes and byte_offset_utf16 are defined above
#   this hunk and are not visible here; presumably they are line_value encoded
#   as UTF-16-LE and code_unit_offset * 2 respectively -- confirm. The "+ 1"
#   index is in range only if the byte string has even length, which holds for
#   well-formed UTF-16 data.
#
# Hunk 2 (test class in language_server_protocol_test.py):
#   Adds test_CodepointsToUTF16CodeUnitsSurrogate, which calls
#   lsp.UTF16CodeUnitsToCodepoints with a code-unit offset that lands inside a
#   surrogate pair and expects the codepoint count including that character:
#     '😉'  at code_units = 1 -> 1
#     'f😉' at code_units = 2 -> 2
#   NOTE(review): despite the name, the test only exercises
#   UTF16CodeUnitsToCodepoints, not the codepoints-to-code-units direction.
#
# NOTE(review): in this copy the patch's line breaks have been collapsed into
# spaces; the original line structure and indentation must be restored from
# upstream before the patch can be applied.
diff --git a/ycmd/completers/language_server/language_server_protocol.py b/ycmd/completers/language_server/language_server_protocol.py index 365f8a04c5..5a58a17124 100644 --- a/ycmd/completers/language_server/language_server_protocol.py +++ b/ycmd/completers/language_server/language_server_protocol.py @@ -823,6 +823,13 @@ def UTF16CodeUnitsToCodepoints( line_value, code_unit_offset ): # is one-past-the-end of the string in unicode codepoints return len( line_value ) + 1 + if byte_offset_utf16 < len( value_as_utf16_bytes ): + hi_byte_at_offset = value_as_utf16_bytes[ byte_offset_utf16 + 1 ] + if 0xdc <= hi_byte_at_offset <= 0xdf: + # If we are in the middle of a surrogate pair, then advance the offset + # further to skip the pair + code_unit_offset += 1 + bytes_included = value_as_utf16_bytes[ : code_unit_offset * 2 ] return len( bytes_included.decode( 'utf-16-le' ) ) diff --git a/ycmd/tests/language_server/language_server_protocol_test.py b/ycmd/tests/language_server/language_server_protocol_test.py index 15c91a0001..c9378271bf 100644 --- a/ycmd/tests/language_server/language_server_protocol_test.py +++ b/ycmd/tests/language_server/language_server_protocol_test.py @@ -204,3 +204,14 @@ def test_CodepointsToUTF16CodeUnitsAndReverse( self ): equal_to( code_units ) ) assert_that( lsp.UTF16CodeUnitsToCodepoints( line_value, code_units ), equal_to( codepoints ) ) + + def test_CodepointsToUTF16CodeUnitsSurrogate( self ): + for line_value, codepoints, code_units in [ + ( '😉', 1, 1 ), + ( 'f😉', 2, 2 ), + ]: + with self.subTest( line_value = line_value, + codepoints = codepoints, + code_units = code_units ): + assert_that( lsp.UTF16CodeUnitsToCodepoints( line_value, code_units ), + equal_to( codepoints ) )