Skip to content

Commit 72d896f

Browse files
Move width() into codepoint_properties.char_width.
1 parent e586d17 commit 72d896f

File tree

5 files changed

+58
-45
lines changed

5 files changed

+58
-45
lines changed

src/unicode/codepoint_properties.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,13 +26,14 @@ namespace unicode
2626

2727
struct LIBUNICODE_PACKED codepoint_properties
2828
{
29+
uint8_t char_width = 0;
30+
uint8_t flags = 0;
2931
Script script = Script::Unknown;
3032
Grapheme_Cluster_Break grapheme_cluster_break = Grapheme_Cluster_Break::Other;
3133
East_Asian_Width east_asian_width = East_Asian_Width::Narrow;
3234
General_Category general_category = General_Category::Unassigned;
3335
EmojiSegmentationCategory emoji_segmentation_category = EmojiSegmentationCategory::Invalid;
34-
uint8_t flags = 0;
35-
uint16_t pad = 0;
36+
uint8_t pad = 0;
3637

3738
static uint8_t constexpr FlagEmoji = 0x01; // NOLINT(readability-identifier-naming)
3839
static uint8_t constexpr FlagEmojiPresentation = 0x02; // NOLINT(readability-identifier-naming)

src/unicode/codepoint_properties_loader.cpp

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,46 @@ namespace
360360
}
361361
};
362362

363+
/// Derives the number of text columns a codepoint occupies from its already-loaded properties.
///
/// @param properties  the codepoint's properties; general_category, the emoji presentation
///                    flag and east_asian_width are consulted, in that order.
///
/// @return 0 for categories treated as occupying no column, 2 for emoji-presentation and
///         East Asian Wide/Fullwidth codepoints, and 1 for everything else.
inline uint8_t toCharWidth(codepoint_properties const& properties) noexcept
{
    auto const category = properties.general_category;

    // General categories that are reported as zero columns wide.
    // NB: Modifier_Symbol is intentionally not part of this set.
    bool const occupiesNoColumn = category == General_Category::Control // XXX really?
                                  || category == General_Category::Enclosing_Mark
                                  || category == General_Category::Format
                                  || category == General_Category::Line_Separator
                                  || category == General_Category::Nonspacing_Mark
                                  || category == General_Category::Paragraph_Separator
                                  || category == General_Category::Spacing_Mark
                                  || category == General_Category::Surrogate;
    if (occupiesNoColumn)
        return 0;

    // UAX #11 §5 Recommendations:
    //     [UTS51] emoji presentation sequences behave as though they were East Asian Wide,
    //     regardless of their assigned East_Asian_Width property value.
    if (properties.emoji_presentation())
        return 2;

    // Only Wide and Fullwidth take two columns; Narrow, Ambiguous, Halfwidth, Neutral
    // (and any value outside the enumeration) take one.
    auto const eastAsianWidth = properties.east_asian_width;
    bool const isDoubleWidth =
        eastAsianWidth == East_Asian_Width::Wide || eastAsianWidth == East_Asian_Width::Fullwidth;

    return isDoubleWidth ? 2 : 1;
}
402+
363403
inline EmojiSegmentationCategory toEmojiSegmentationCategory(char32_t codepoint,
364404
codepoint_properties const& props) noexcept
365405
{
@@ -397,6 +437,7 @@ namespace
397437

398438
return EmojiSegmentationCategory::Invalid;
399439
}
440+
400441
class codepoint_properties_loader
401442
{
402443
public:
@@ -493,7 +534,6 @@ namespace
493534
properties(codepoint).flags |= i->second;
494535
});
495536

496-
497537
process_properties("extracted/DerivedGeneralCategory.txt",
498538
[&](char32_t codepoint, string_view value) {
499539
(void) codepoint;
@@ -552,6 +592,14 @@ namespace
552592
toEmojiSegmentationCategory(codepoint, properties(codepoint));
553593
}
554594
// }}}
595+
596+
// {{{ assign char_width
597+
{
598+
auto const _ = scoped_timer { _log, "Assigning char_width" };
599+
for (char32_t codepoint = 0; codepoint < 0x110'000; ++codepoint)
600+
properties(codepoint).char_width = toCharWidth(properties(codepoint));
601+
}
602+
// }}}
555603
}
556604

557605
codepoint_properties_table codepoint_properties_loader::load_from_directory(

src/unicode/tablegen.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,12 +81,13 @@ void generate_cxx_properties_table(std::ostream& header,
8181
// clang-format off
8282
auto const& properties = propertiesTable[i];
8383
implementation << " {"
84+
<< static_cast<unsigned>(properties.char_width) << ", "
85+
<< (!properties.flags ? "0"s : fmt::format("0b{:08b}", properties.flags)) << ", "
8486
<< fmt::format("Script::{}, ", properties.script)
8587
<< fmt::format("Grapheme_Cluster_Break::{}, ", properties.grapheme_cluster_break)
8688
<< fmt::format("East_Asian_Width::{}, ", properties.east_asian_width)
8789
<< fmt::format("General_Category::{}, ", properties.general_category)
88-
<< fmt::format("EmojiSegmentationCategory::{}, ", properties.emoji_segmentation_category)
89-
<< (!properties.flags ? "0"s : fmt::format("0b{:08b}", properties.flags))
90+
<< fmt::format("EmojiSegmentationCategory::{} ", properties.emoji_segmentation_category)
9091
<< "},\n";
9192
// clang-format on
9293
}

src/unicode/width.cpp

Lines changed: 2 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -18,46 +18,9 @@
1818
namespace unicode
1919
{
2020

21-
int width(char32_t codepoint)
21+
int width(char32_t codepoint) noexcept
2222
{
23-
// Small optimization to speed up US-ASCII width calculation.
24-
if (0x20 <= codepoint && codepoint <= 0xA0)
25-
return 1;
26-
27-
// TODO: make this at most one lookup
28-
auto const& properties = codepoint_properties::get(codepoint);
29-
30-
switch (properties.general_category)
31-
{
32-
case General_Category::Control: // XXX really?
33-
case General_Category::Enclosing_Mark:
34-
case General_Category::Format:
35-
case General_Category::Line_Separator:
36-
// case General_Category::Modifier_Symbol:
37-
case General_Category::Nonspacing_Mark:
38-
case General_Category::Paragraph_Separator:
39-
case General_Category::Spacing_Mark:
40-
case General_Category::Surrogate: return 0;
41-
default: break;
42-
}
43-
44-
if (properties.emoji_presentation())
45-
// UAX #11 §5 Recommendations:
46-
// [UTS51] emoji presentation sequences behave as though they were East Asian Wide,
47-
// regardless of their assigned East_Asian_Width property value.
48-
return 2;
49-
50-
switch (properties.east_asian_width)
51-
{
52-
case East_Asian_Width::Narrow:
53-
case East_Asian_Width::Ambiguous:
54-
case East_Asian_Width::Halfwidth:
55-
case East_Asian_Width::Neutral: return 1;
56-
case East_Asian_Width::Wide:
57-
case East_Asian_Width::Fullwidth: return 2;
58-
}
59-
60-
return 1;
23+
return codepoint_properties::get(codepoint).char_width;
6124
}
6225

6326
} // namespace unicode

src/unicode/width.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,6 @@ namespace unicode
1717
{
1818

1919
/// Returns the number of text columns the given codepoint would need to be displayed.
20-
int width(char32_t codepoint);
20+
int width(char32_t codepoint) noexcept;
2121

2222
} // namespace unicode

0 commit comments

Comments
 (0)