Skip to content

Commit e37bbc0

Browse files
Merge pull request #49 from contour-terminal/feature/unicode-query
Adding new tool unicode-query to deprecate uc-inspect.
2 parents 2ced48a + 70b7acf commit e37bbc0

File tree

2 files changed

+156
-3
lines changed

2 files changed

+156
-3
lines changed

src/tools/CMakeLists.txt

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
if(LIBUNICODE_TOOLS)
2-
add_executable(uc-inspect uc-inspect.cpp)
3-
target_link_libraries(uc-inspect fmt::fmt-header-only unicode)
4-
install(TARGETS uc-inspect DESTINATION bin)
2+
# This tool is going to be replaced by unicode-query,
3+
# so disable it early to keep it out of the release.
4+
#
5+
# add_executable(uc-inspect uc-inspect.cpp)
6+
# target_link_libraries(uc-inspect fmt::fmt-header-only unicode)
7+
# install(TARGETS uc-inspect DESTINATION bin)
8+
9+
add_executable(unicode-query unicode-query.cpp)
10+
target_link_libraries(unicode-query fmt::fmt-header-only unicode)
11+
install(TARGETS unicode-query DESTINATION bin)
512
endif()

src/tools/unicode-query.cpp

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
/**
2+
* This file is part of the "libunicode" project
3+
* Copyright (c) 2022 Christian Parpart <[email protected]>
4+
*
5+
* Licensed under the Apache License, Version 2.0 (the "License");
6+
* you may not use this file except in compliance with the License.
7+
*
8+
* Unless required by applicable law or agreed to in writing, software
9+
* distributed under the License is distributed on an "AS IS" BASIS,
10+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
* See the License for the specific language governing permissions and
12+
* limitations under the License.
13+
*/
14+
15+
#include <unicode/codepoint_properties_loader.h>
16+
#include <unicode/convert.h>
17+
#include <unicode/grapheme_segmenter.h>
18+
#include <unicode/ucd.h>
19+
#include <unicode/ucd_enums.h>
20+
#include <unicode/ucd_fmt.h>
21+
#include <unicode/utf8_grapheme_segmenter.h>
22+
23+
#include <fmt/format.h>
24+
25+
#include <charconv>
26+
#include <iostream>
27+
#include <optional>
28+
#include <string>
29+
30+
using namespace std;
31+
32+
namespace
33+
{
34+
35+
/// Wraps @p text in double quotes and escapes every byte that cannot be shown verbatim.
///
/// Printable characters are copied as-is. The double quote itself and every
/// non-printable byte are emitted as a two-digit uppercase `\xHH` escape.
///
/// @param text  Raw byte sequence to render, e.g. the UTF-8 encoding of a codepoint.
/// @return      The quoted and escaped representation; an empty input yields `""`.
std::string quotedAndEscaped(std::string const& text)
{
    static constexpr char hexDigits[] = "0123456789ABCDEF";

    std::string result = "\"";
    for (char const ch: text)
    {
        // std::isprint() must only be given values representable as unsigned char
        // (or EOF); passing a negative plain char is undefined behavior, and that
        // is exactly what the bytes of a multi-byte UTF-8 sequence are on
        // platforms where char is signed.
        auto const byte = static_cast<unsigned char>(ch);
        if (std::isprint(byte) && ch != '"')
            result += ch;
        else
        {
            result += "\\x";
            result += hexDigits[byte >> 4];
            result += hexDigits[byte & 0x0F];
        }
    }
    result += '"';
    return result;
}
48+
49+
/// Writes the one-line command synopsis to standard output.
///
/// @param exitCode  Value handed back unchanged, so callers can write
///                  `return printUsage(code);`.
/// @return          @p exitCode.
int printUsage(int exitCode)
{
    static constexpr char synopsis[] = "unicode-query [properties] U+XXXX [...]\n";
    std::cout << synopsis;
    return exitCode;
}
54+
55+
/// Parses a single codepoint written in hexadecimal, with or without a leading "U+".
///
/// @param text  Textual codepoint such as "U+1F600" or "1F600".
/// @return      The codepoint, or std::nullopt if @p text is empty, contains
///              anything other than hexadecimal digits, does not fit into
///              32 bits, or exceeds the highest Unicode codepoint U+10FFFF.
std::optional<char32_t> parseChar(std::string_view text)
{
    constexpr auto maxCodepoint = uint32_t { 0x10FFFF };

    if (text.size() >= 3 && text[0] == 'U' && text[1] == '+')
        text.remove_prefix(2);

    auto value = uint32_t {};
    auto const* const last = text.data() + text.size();
    auto const result = std::from_chars(text.data(), last, value, 16);

    // Checking result.ptr alone is not sufficient: for an empty input, and for
    // an all-digit input that overflows, ptr equals last while value is left at
    // zero, which would be misreported as U+0000.
    if (result.ec != std::errc {} || result.ptr != last)
        return std::nullopt;

    if (value > maxCodepoint)
        return std::nullopt;

    return static_cast<char32_t>(value);
}
66+
67+
vector<char32_t> parseChars(std::string_view text)
68+
{
69+
if (text.size() >= 3 && text[0] == 'U' && text[1] == '+')
70+
{
71+
text.remove_prefix(2);
72+
if (auto const parsedChar = parseChar(text); parsedChar.has_value())
73+
return { parsedChar.value() };
74+
else
75+
return {}; // error
76+
}
77+
78+
auto parsedChars = vector<char32_t> {};
79+
80+
for (char32_t const ch: unicode::from_utf8(text))
81+
parsedChars.emplace_back(ch);
82+
83+
return parsedChars;
84+
}
85+
86+
void showCodepointProperties(char32_t codepoint)
87+
{
88+
auto const properties = unicode::codepoint_properties::get(codepoint);
89+
90+
// clang-format off
91+
cout << fmt::format("Codepoint : U+{:X}\n", uint32_t(codepoint));
92+
cout << fmt::format("UTF-8 : {}\n", quotedAndEscaped(unicode::convert_to<char>(codepoint)));
93+
if (properties.general_category != unicode::General_Category::Control)
94+
cout << fmt::format("Display : {}\n", unicode::convert_to<char>(codepoint));
95+
cout << fmt::format("Plane : {}\n", unicode::plane(codepoint));
96+
cout << fmt::format("Block : {}\n", unicode::block(codepoint));
97+
cout << fmt::format("Script : {}\n", unicode::script(codepoint));
98+
cout << fmt::format("General Category : {}\n", properties.general_category);
99+
cout << fmt::format("East Asian Width : {}\n", properties.east_asian_width);
100+
cout << fmt::format("Character width : {}\n", properties.char_width);
101+
cout << fmt::format("Emoji Segmentation Category : {}\n", properties.emoji_segmentation_category);
102+
cout << fmt::format("Grapheme Cluster Break : {}\n", properties.grapheme_cluster_break);
103+
cout << "\n";
104+
// clang-format off
105+
}
106+
107+
int showCodepointProperties(int argc, char const* argv[])
108+
{
109+
int arg = 0;
110+
while (arg < argc)
111+
{
112+
auto const codepoints = parseChars(argv[arg]);
113+
if (codepoints.empty())
114+
cerr << "Failed to parse codepoint " << argv[arg] << "\n";
115+
else
116+
for (auto const codepoint: codepoints)
117+
showCodepointProperties(codepoint);
118+
++arg;
119+
}
120+
return EXIT_SUCCESS;
121+
}
122+
123+
} // namespace
124+
125+
// Example usage:
126+
//
127+
// unicode-query [properties] U+1234 [U+5678 ...]
128+
//
129+
// unicode-query analyze "Text string"
130+
//
131+
// Analyzes the given input string for common Unicode properties
132+
// and prints out each segment with the gathered information.
133+
int main(int argc, char const* argv[])
134+
{
135+
if (argc == 1)
136+
return printUsage(EXIT_FAILURE);
137+
138+
int argIndex = 1;
139+
if (string_view(argv[argIndex]) == "help")
140+
return printUsage(EXIT_SUCCESS);
141+
142+
if (string_view(argv[argIndex]) == "properties")
143+
++argIndex;
144+
145+
return showCodepointProperties(argc - argIndex, argv + argIndex);
146+
}

0 commit comments

Comments
 (0)