63#include <boost/locale.hpp>
76 return aCharacter >= 0xD800 && aCharacter <= 0xDBFF;
80 return aCharacter >= 0xDC00 && aCharacter <= 0xDFFF;
90 if (aCharacter <= 0x7F)
92 aString.append(1,
static_cast<char>(aCharacter));
95 else if (aCharacter <= 0x7FF)
97 aString.append(1,
static_cast<char>(((aCharacter >> 6) & 0x1F) | 0xC0));
98 aString.append(1,
static_cast<char>((aCharacter & 0x3F) | 0x80));
101 else if (aCharacter <= 0xFFFF)
103 aString.append(1,
static_cast<char>(((aCharacter >> 12) & 0x0F) | 0xE0));
104 aString.append(1,
static_cast<char>(((aCharacter >> 6 ) & 0x3F) | 0x80));
105 aString.append(1,
static_cast<char>((aCharacter& 0x3F) | 0x80));
108 else if (aCharacter <= 0x10FFFF)
110 aString.append(1,
static_cast<char>(((aCharacter >> 18) & 0x07) | 0xF0));
111 aString.append(1,
static_cast<char>(((aCharacter >> 12 ) & 0x3F) | 0x80));
112 aString.append(1,
static_cast<char>(((aCharacter >> 6 ) & 0x3F) | 0x80));
113 aString.append(1,
static_cast<char>((aCharacter& 0x3F) | 0x80));
132 void operator()(std::u16string::size_type aFrom,
bool aSurrogatePair,
const std::string& aNarrowString, std::string::size_type aNumberAdded)
134 for (std::string::size_type i = 0; i < aNumberAdded; ++i)
135 iCharMap[aNarrowString.size() - aNumberAdded + i] = aFrom;
136 if (aSurrogatePair && aNarrowString.size())
138 if (aNarrowString.size() <= 1)
140 iCharMap[aNarrowString.size() - 1] = aFrom + 1;
147 void operator()(std::u16string::size_type,
bool,
const std::string&, std::string::size_type) {}
151 template <
bool AllowUpper128,
typename CharacterMapUpdater>
152 inline std::string
utf16_to_utf8(
const std::u16string& aString, CharacterMapUpdater aCharacterMapUpdater)
154 bool previousWasUtf8Prefix =
false;
155 std::string narrowString;
156 std::u16string::size_type from = 0;
157 for (std::u16string::const_iterator i = aString.begin(); i != aString.end(); from = i - aString.begin())
159 bool sequenceCheck = previousWasUtf8Prefix;
160 previousWasUtf8Prefix =
false;
162 bool surrogatePair =
false;
165 uch = ((uch & 0x3FF) << 10);
166 uch = uch | (*i++ & 0x3FF);
168 surrogatePair =
true;
170 else if (AllowUpper128)
172 int narrowChar = wctob(
static_cast<wint_t
>(uch));
173 if (narrowChar !=
static_cast<int>(EOF) && narrowChar !=
static_cast<int>(WEOF) &&
static_cast<unsigned char>(narrowChar) > 0x7Fu)
175 unsigned char nch =
static_cast<unsigned char>(narrowChar);
176 if ((nch & 0xE0) == 0xC0 || (nch & 0xF0) == 0xE0 || (nch & 0xF8) == 0xF0)
178 previousWasUtf8Prefix =
true;
180 else if (sequenceCheck && (nch & 0xC0) == 0x80)
182 int previousNarrowChar =
static_cast<int>(narrowString[narrowString.size()-1]);
183 narrowString.erase(narrowString.size() - 1);
184 aCharacterMapUpdater(from, surrogatePair, narrowString,
append_utf8(narrowString,
static_cast<unicode_char_t>(btowc(previousNarrowChar))));
186 narrowString.append(1,
static_cast<char>(narrowChar));
187 aCharacterMapUpdater(from, surrogatePair, narrowString, 1);
191 aCharacterMapUpdater(from, surrogatePair, narrowString,
append_utf8(narrowString, uch));
196 template <
bool AllowUpper128>
202 template <
bool AllowUpper128>
210 return utf16_to_utf8<false>(aString);
215 return utf16_to_utf8<false>(aString, aCharMap);
220 template <
typename FwdIter>
224 FwdIter start = aCurrent;
225 for (std::size_t i = 0; i != aCount; ++i)
228 if (aCurrent == aEnd)
233 unsigned char nch =
static_cast<unsigned char>(*aCurrent);
234 if (nch == 0xC0 || nch == 0xC1)
239 if ((nch & 0xC0) == 0x80)
240 unicodeChar = (unicodeChar << 6) | (static_cast<unicode_char_t>(nch & ~0xC0));
247 static const unicode_char_t sMaxCodePoint[] = { 0x7F, 0x7FF, 0xFFFF, 0x10FFFF };
248 if (unicodeChar <= sMaxCodePoint[aCount - 1])
259 template <
typename Callback>
260 inline std::u16string
utf8_to_utf16(
const std::string& aString, Callback aCallback,
bool aCodePageFallback =
false)
262 std::u16string utf16String;
263 for (std::string::const_iterator i = aString.begin(); i != aString.end();)
265 aCallback(i - aString.begin(), utf16String.size());
266 unsigned char nch =
static_cast<unsigned char>(*i);
268 if ((nch & 0x80) == 0)
272 std::string::const_iterator old = i;
273 if (nch == 0xC0 || nch == 0xC1)
275 else if ((nch & 0xE0) == 0xC0)
277 else if ((nch & 0xF0) == 0xE0)
279 else if ((nch & 0xF8) == 0xF0)
283 if (i == old && aCodePageFallback)
286 std::mbstate_t state = std::mbstate_t{};
287 if (std::mbrtowc(&wch,
reinterpret_cast<char*
>(&nch), 1, &state) == 1)
295 utf16String.append(1,
static_cast<char16_t>(uch));
300 utf16String.append(1,
static_cast<char16_t>(0xd800|(uch >> 10)));
301 utf16String.append(1,
static_cast<char16_t>(0xdc00|(uch & 0x3FF)));
304 if (i != aString.end())
310 inline std::u16string
utf8_to_utf16(
const std::string& aString,
bool aCodePageFallback =
false)
315 template <
typename Callback>
316 inline std::u32string
utf8_to_utf32(std::string_view
const& aStringView, Callback aCallback,
bool aCodePageFallback =
false)
318 auto begin = aStringView.begin();
319 auto end = aStringView.end();
320 std::u32string utf32String;
321 for (
auto i = begin; i != end; ++i)
323 aCallback(i - begin, utf32String.size());
325 unsigned char nch =
static_cast<unsigned char>(*i);
327 if ((nch & 0x80) == 0)
332 if (nch == 0xC0 || nch == 0xC1)
334 else if ((nch & 0xE0) == 0xC0)
336 else if ((nch & 0xF0) == 0xE0)
338 else if ((nch & 0xF8) == 0xF0)
342 if (i == old && aCodePageFallback)
345 std::mbstate_t state = std::mbstate_t{};
346 if (std::mbrtoc32(&ch32,
reinterpret_cast<char*
>(&nch), 1, &state) == 1)
353 utf32String.append(1, uch);
358 template <
typename Callback>
359 inline std::u32string
utf8_to_utf32(std::string::const_iterator aBegin, std::string::const_iterator aEnd, Callback aCallback,
bool aCodePageFallback =
false)
363 return std::u32string{};
364 return utf8_to_utf32(std::string_view{ &*aBegin,
static_cast<std::string_view::size_type
>(
std::distance(aBegin, aEnd)) }, aCallback, aCodePageFallback);
// Converts a whole std::string by delegating to the iterator-range overload
// of utf8_to_utf32.
//
// aString            input string; its full [begin, end) range is forwarded.
// aCallback          forwarded unchanged to the iterator-range overload.
// aCodePageFallback  forwarded unchanged (defaults to false).
//
// Returns whatever the iterator-range overload returns.
template <typename Callback>
inline std::u32string utf8_to_utf32(const std::string& aString, Callback aCallback, bool aCodePageFallback = false)
{
    const std::string::const_iterator first = aString.begin();
    const std::string::const_iterator last = aString.end();
    return utf8_to_utf32(first, last, aCallback, aCodePageFallback);
}
373 inline std::u32string
utf8_to_utf32(std::string::const_iterator aBegin, std::string::const_iterator aEnd,
bool aCodePageFallback =
false)
378 inline std::u32string
utf8_to_utf32(
const std::string& aString,
bool aCodePageFallback =
false)
380 return utf8_to_utf32(aString.begin(), aString.end(), aCodePageFallback);
383 inline std::u32string
utf8_to_utf32(
const std::string_view& aStringView,
bool aCodePageFallback =
false)
391 for (
auto ch : aString)
398 return (aCharacter & 0xC0) == 0x80;
401 template <
typename CharT,
typename Traits>
402 inline bool check_utf8(
const std::basic_string_view<CharT, Traits>& aString)
404 auto end = aString.end();
405 for (
auto i = aString.begin(); i != end; ++i)
407 unsigned char nch =
static_cast<unsigned char>(*i);
409 if ((nch & 0x80) == 0)
414 if (nch == 0xC0 || nch == 0xC1)
416 else if ((nch & 0xE0) == 0xC0)
418 else if ((nch & 0xF0) == 0xE0)
420 else if ((nch & 0xF8) == 0xF0)
// std::basic_string convenience overload of check_utf8: builds a
// basic_string_view over the string's characters and returns the result of
// the basic_string_view overload for that view.
template <typename CharT, typename Traits, typename Alloc>
inline bool check_utf8(const std::basic_string<CharT, Traits, Alloc>& aString)
{
    const std::basic_string_view<CharT, Traits> view{ aString.data(), aString.size() };
    return check_utf8(view);
}
442 template <
typename StringT>
454 template <
typename StringT>
455 inline StringT
utf8_to_any(
const std::string& aString,
bool aCodePageFallback =
false)
466 inline std::u16string
any_to_utf16(
const std::string& aString,
bool aCodePageFallback =
false)
481 inline const std::u16string&
any_to_utf16(
const std::u16string& aString)
486 template <
typename StringT>
490 any_to_utf16_result(
const typename StringT::value_type* aString,
typename StringT::size_type aStringLength,
bool aCodePageFallback =
false) :
491 iString(
utf8_to_utf16(StringT(aString, aStringLength), aCodePageFallback))
497 return iString.data();
501 return iString.length();
504 const std::u16string iString;
513 iStringLength(aStringLength)
523 return iStringLength;
526 const char16_t* iString;
527 std::u16string::size_type iStringLength;
// Converts a wide string to a multibyte ("narrow") string with wcstombs,
// i.e. using the encoding of the current C locale.
//
// aWideString  the wide string to convert; conversion stops at the first
//              embedded null character because wcstombs treats the input as
//              a null-terminated string.
//
// Returns the converted bytes. If wcstombs reports an unconvertible
// character, whatever it left in the zero-initialised buffer up to the first
// null byte is returned (best effort; no error is signalled).
//
// NOTE(review): c_str() is handed straight to wcstombs, so this presumably
// only instantiates for CharT = wchar_t -- confirm against callers.
template <typename CharT, typename Traits, typename Alloc>
inline std::string utf16_to_narrow(const std::basic_string<CharT, Traits, Alloc>& aWideString)
{
    // One wide character can expand to as many as MB_CUR_MAX bytes, so a
    // buffer of size() + 1 bytes is too small for multibyte locales: the
    // output would be truncated and, when it exactly filled the buffer,
    // left without a terminating null.
    std::vector<char> narrowString(aWideString.size() * MB_CUR_MAX + 1, '\0');
    // Keep the final byte out of wcstombs' reach so the buffer is always
    // null-terminated, whatever the conversion result.
    wcstombs(narrowString.data(), aWideString.c_str(), narrowString.size() - 1);
    return std::string(narrowString.data());
}
// Converts a multibyte ("narrow") string to UTF-16 by decoding it with
// mbstowcs (current C locale) and re-encoding the resulting wide characters
// as char16_t code units.
//
// aNarrowString  the multibyte string to convert; conversion stops at the
//                first embedded null because mbstowcs treats the input as a
//                null-terminated string.
//
// Returns the UTF-16 string. If mbstowcs reports an invalid sequence,
// whatever it left in the zero-initialised buffer up to the first null is
// returned (best effort; no error is signalled).
//
// NOTE(review): c_str() is handed straight to mbstowcs, so this presumably
// only instantiates for CharT = char -- confirm against callers.
template <typename CharT, typename Traits, typename Alloc>
inline std::u16string narrow_to_utf16(const std::basic_string<CharT, Traits, Alloc>& aNarrowString)
{
    // mbstowcs writes wchar_t, not char16_t, so decode into a wchar_t buffer
    // first. Each wide character consumes at least one input byte, so
    // size() + 1 elements always suffice; the last element is kept out of
    // mbstowcs' reach so the buffer stays null-terminated.
    std::vector<wchar_t> wideString(aNarrowString.size() + 1, L'\0');
    mbstowcs(wideString.data(), aNarrowString.c_str(), wideString.size() - 1);

    std::u16string utf16String;
    utf16String.reserve(aNarrowString.size());
    for (const wchar_t* next = wideString.data(); *next != L'\0'; ++next)
    {
        const char32_t codePoint = static_cast<char32_t>(*next);
        if (codePoint <= 0xFFFFu)
        {
            utf16String.push_back(static_cast<char16_t>(codePoint));
        }
        else
        {
            // Where wchar_t is wider than 16 bits a single wide character can
            // lie outside the BMP and must become a surrogate pair.
            const char32_t offset = codePoint - 0x10000u;
            utf16String.push_back(static_cast<char16_t>(0xD800u | ((offset >> 10) & 0x3FFu)));
            utf16String.push_back(static_cast<char16_t>(0xDC00u | (offset & 0x3FFu)));
        }
    }
    return utf16String;
}
any_to_utf16_result(const char16_t *aString, std::u16string::size_type aStringLength)
const char16_t * data() const
std::u16string::size_type length() const
std::u16string::size_type length() const
const char16_t * data() const
any_to_utf16_result(const typename StringT::value_type *aString, typename StringT::size_type aStringLength, bool aCodePageFallback=false)
std::string_view to_std_string_view() const noexcept
void default_utf16_conversion_callback(std::string::size_type, std::u16string::size_type)
unicode_char_t next_utf_bits(unicode_char_t aUnicodeChar, std::size_t aCount, FwdIter &aCurrent, FwdIter aEnd)
void default_utf32_conversion_callback(std::string::size_type, std::u32string::size_type)
bool is_surrogate_pair(unicode_char_t aHighValue, unicode_char_t aLowValue)
bool is_low_surrogate(unicode_char_t aCharacter)
bool is_high_surrogate(unicode_char_t aCharacter)
std::map< std::string::size_type, std::u16string::size_type > utf16_to_utf8_character_map
StringT utf8_to_any(const std::string &aString, bool aCodePageFallback=false)
const unicode_char_t INVALID_CHAR32
bool is_utf8_trailing(char aCharacter)
bool check_utf8(const std::basic_string_view< CharT, Traits > &aString)
std::string utf32_to_utf8(const std::u32string &aString)
std::u16string narrow_to_utf16(const std::basic_string< CharT, Traits, Alloc > &aNarrowString)
const std::string & any_to_utf8(const std::string &aString)
std::string utf16_to_narrow(const std::basic_string< CharT, Traits, Alloc > &aWideString)
std::string utf16_to_utf8(const std::u16string &aString, CharacterMapUpdater aCharacterMapUpdater)
std::u16string any_to_utf16(const std::string &aString, bool aCodePageFallback=false)
StringT utf16_to_any(const std::u16string &aString)
std::size_t append_utf8(std::string &aString, unicode_char_t aCharacter)
std::u32string utf8_to_utf32(std::string_view const &aStringView, Callback aCallback, bool aCodePageFallback=false)
std::string utf8_to_any< std::string >(const std::string &aString, bool)
std::u16string utf16_to_any< std::u16string >(const std::u16string &aString)
std::u16string utf8_to_utf16(const std::string &aString, Callback aCallback, bool aCodePageFallback=false)
iterator_traits< it_type >::difference_type distance(const it_type first, const it_type last)
character_map_updater(utf16_to_utf8_character_map &aCharMap)
void operator()(std::u16string::size_type aFrom, bool aSurrogatePair, const std::string &aNarrowString, std::string::size_type aNumberAdded)
utf16_to_utf8_character_map & iCharMap
void operator()(std::u16string::size_type, bool, const std::string &, std::string::size_type)
no_character_map_updater()