mirror of
https://github.com/monero-project/monero.git
synced 2025-01-09 20:40:11 +00:00
epee: support unicode in parsed strings
This commit is contained in:
parent
310c26824d
commit
eeca5ca0c8
3 changed files with 78 additions and 22 deletions
|
@ -31,6 +31,9 @@
|
|||
#include <algorithm>
|
||||
#include <boost/utility/string_ref.hpp>
|
||||
|
||||
#undef MONERO_DEFAULT_LOG_CATEGORY
|
||||
#define MONERO_DEFAULT_LOG_CATEGORY "serialization"
|
||||
|
||||
namespace epee
|
||||
{
|
||||
namespace misc_utils
|
||||
|
@ -62,6 +65,26 @@ namespace misc_utils
|
|||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
};
|
||||
|
||||
static const constexpr unsigned char isx[256] =
|
||||
{
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 10, 11, 12, 13, 14, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 10, 11, 12, 13, 14, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
};
|
||||
|
||||
inline bool isspace(char c)
|
||||
{
|
||||
return lut[(uint8_t)c] & 8;
|
||||
|
@ -162,6 +185,42 @@ namespace misc_utils
|
|||
val.push_back('\\');break;
|
||||
case '/': //Slash character
|
||||
val.push_back('/');break;
|
||||
case 'u': //Unicode code point
|
||||
if (buf_end - it < 4)
|
||||
{
|
||||
ASSERT_MES_AND_THROW("Invalid Unicode escape sequence");
|
||||
}
|
||||
else
|
||||
{
|
||||
uint32_t dst = 0;
|
||||
for (int i = 0; i < 4; ++i)
|
||||
{
|
||||
const unsigned char tmp = isx[(int)*++it];
|
||||
CHECK_AND_ASSERT_THROW_MES(tmp != 0xff, "Bad Unicode encoding");
|
||||
dst = dst << 4 | tmp;
|
||||
}
|
||||
// encode as UTF-8
|
||||
if (dst <= 0x7f)
|
||||
{
|
||||
val.push_back(dst);
|
||||
}
|
||||
else if (dst <= 0x7ff)
|
||||
{
|
||||
val.push_back(0xc0 | (dst >> 6));
|
||||
val.push_back(0x80 | (dst & 0x3f));
|
||||
}
|
||||
else if (dst <= 0xffff)
|
||||
{
|
||||
val.push_back(0xe0 | (dst >> 12));
|
||||
val.push_back(0x80 | ((dst >> 6) & 0x3f));
|
||||
val.push_back(0x80 | (dst & 0x3f));
|
||||
}
|
||||
else
|
||||
{
|
||||
ASSERT_MES_AND_THROW("Unicode code point is out or range");
|
||||
}
|
||||
}
|
||||
break;
|
||||
default:
|
||||
val.push_back(*it);
|
||||
LOG_PRINT_L0("Unknown escape sequence :\"\\" << *it << "\"");
|
||||
|
|
|
@ -59,26 +59,6 @@
|
|||
#pragma comment (lib, "Rpcrt4.lib")
|
||||
#endif
|
||||
|
||||
static const constexpr unsigned char isx[256] =
|
||||
{
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 10, 11, 12, 13, 14, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 10, 11, 12, 13, 14, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
|
||||
};
|
||||
|
||||
namespace epee
|
||||
{
|
||||
namespace string_tools
|
||||
|
@ -99,10 +79,10 @@ namespace string_tools
|
|||
for(size_t i = 0; i < s.size(); i += 2)
|
||||
{
|
||||
int tmp = *src++;
|
||||
tmp = isx[tmp];
|
||||
tmp = epee::misc_utils::parse::isx[tmp];
|
||||
if (tmp == 0xff) return false;
|
||||
int t2 = *src++;
|
||||
t2 = isx[t2];
|
||||
t2 = epee::misc_utils::parse::isx[t2];
|
||||
if (t2 == 0xff) return false;
|
||||
*dst++ = (tmp << 4) | t2;
|
||||
}
|
||||
|
|
|
@ -946,3 +946,20 @@ TEST(parsing, number)
|
|||
epee::misc_utils::parse::match_number(i, s.end(), val);
|
||||
ASSERT_EQ(val, "+9.34e+03");
|
||||
}
|
||||
|
||||
TEST(parsing, unicode)
|
||||
{
|
||||
std::string bs;
|
||||
std::string s;
|
||||
std::string::const_iterator si;
|
||||
|
||||
s = "\"\""; si = s.begin(); ASSERT_TRUE(epee::misc_utils::parse::match_string(si, s.end(), bs)); ASSERT_EQ(bs, "");
|
||||
s = "\"\\u0000\""; si = s.begin(); ASSERT_TRUE(epee::misc_utils::parse::match_string(si, s.end(), bs)); ASSERT_EQ(bs, std::string(1, '\0'));
|
||||
s = "\"\\u0020\""; si = s.begin(); ASSERT_TRUE(epee::misc_utils::parse::match_string(si, s.end(), bs)); ASSERT_EQ(bs, " ");
|
||||
s = "\"\\u1\""; si = s.begin(); ASSERT_FALSE(epee::misc_utils::parse::match_string(si, s.end(), bs));
|
||||
s = "\"\\u12\""; si = s.begin(); ASSERT_FALSE(epee::misc_utils::parse::match_string(si, s.end(), bs));
|
||||
s = "\"\\u123\""; si = s.begin(); ASSERT_FALSE(epee::misc_utils::parse::match_string(si, s.end(), bs));
|
||||
s = "\"\\u1234\""; si = s.begin(); ASSERT_TRUE(epee::misc_utils::parse::match_string(si, s.end(), bs)); ASSERT_EQ(bs, "ሴ");
|
||||
s = "\"foo\\u1234bar\""; si = s.begin(); ASSERT_TRUE(epee::misc_utils::parse::match_string(si, s.end(), bs)); ASSERT_EQ(bs, "fooሴbar");
|
||||
s = "\"\\u3042\\u307e\\u3084\\u304b\\u3059\""; si = s.begin(); ASSERT_TRUE(epee::misc_utils::parse::match_string(si, s.end(), bs)); ASSERT_EQ(bs, "あまやかす");
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue