epee: support unicode in parsed strings

This commit is contained in:
moneromooo-monero 2019-04-30 19:16:11 +00:00
parent 310c26824d
commit eeca5ca0c8
No known key found for this signature in database
GPG key ID: 686F07454D6CEFC3
3 changed files with 78 additions and 22 deletions

View file

@ -31,6 +31,9 @@
#include <algorithm> #include <algorithm>
#include <boost/utility/string_ref.hpp> #include <boost/utility/string_ref.hpp>
#undef MONERO_DEFAULT_LOG_CATEGORY
#define MONERO_DEFAULT_LOG_CATEGORY "serialization"
namespace epee namespace epee
{ {
namespace misc_utils namespace misc_utils
@ -62,6 +65,26 @@ namespace misc_utils
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
}; };
static const constexpr unsigned char isx[256] =
{
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 10, 11, 12, 13, 14, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 10, 11, 12, 13, 14, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
};
inline bool isspace(char c) inline bool isspace(char c)
{ {
return lut[(uint8_t)c] & 8; return lut[(uint8_t)c] & 8;
@ -162,6 +185,42 @@ namespace misc_utils
val.push_back('\\');break; val.push_back('\\');break;
case '/': //Slash character case '/': //Slash character
val.push_back('/');break; val.push_back('/');break;
case 'u': //Unicode code point
if (buf_end - it < 4)
{
ASSERT_MES_AND_THROW("Invalid Unicode escape sequence");
}
else
{
uint32_t dst = 0;
for (int i = 0; i < 4; ++i)
{
const unsigned char tmp = isx[(int)*++it];
CHECK_AND_ASSERT_THROW_MES(tmp != 0xff, "Bad Unicode encoding");
dst = dst << 4 | tmp;
}
// encode as UTF-8
if (dst <= 0x7f)
{
val.push_back(dst);
}
else if (dst <= 0x7ff)
{
val.push_back(0xc0 | (dst >> 6));
val.push_back(0x80 | (dst & 0x3f));
}
else if (dst <= 0xffff)
{
val.push_back(0xe0 | (dst >> 12));
val.push_back(0x80 | ((dst >> 6) & 0x3f));
val.push_back(0x80 | (dst & 0x3f));
}
else
{
ASSERT_MES_AND_THROW("Unicode code point is out or range");
}
}
break;
default: default:
val.push_back(*it); val.push_back(*it);
LOG_PRINT_L0("Unknown escape sequence :\"\\" << *it << "\""); LOG_PRINT_L0("Unknown escape sequence :\"\\" << *it << "\"");

View file

@ -59,26 +59,6 @@
#pragma comment (lib, "Rpcrt4.lib") #pragma comment (lib, "Rpcrt4.lib")
#endif #endif
static const constexpr unsigned char isx[256] =
{
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 10, 11, 12, 13, 14, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 10, 11, 12, 13, 14, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
};
namespace epee namespace epee
{ {
namespace string_tools namespace string_tools
@ -99,10 +79,10 @@ namespace string_tools
for(size_t i = 0; i < s.size(); i += 2) for(size_t i = 0; i < s.size(); i += 2)
{ {
int tmp = *src++; int tmp = *src++;
tmp = isx[tmp]; tmp = epee::misc_utils::parse::isx[tmp];
if (tmp == 0xff) return false; if (tmp == 0xff) return false;
int t2 = *src++; int t2 = *src++;
t2 = isx[t2]; t2 = epee::misc_utils::parse::isx[t2];
if (t2 == 0xff) return false; if (t2 == 0xff) return false;
*dst++ = (tmp << 4) | t2; *dst++ = (tmp << 4) | t2;
} }

View file

@ -946,3 +946,20 @@ TEST(parsing, number)
epee::misc_utils::parse::match_number(i, s.end(), val); epee::misc_utils::parse::match_number(i, s.end(), val);
ASSERT_EQ(val, "+9.34e+03"); ASSERT_EQ(val, "+9.34e+03");
} }
TEST(parsing, unicode)
{
std::string bs;
std::string s;
std::string::const_iterator si;
s = "\"\""; si = s.begin(); ASSERT_TRUE(epee::misc_utils::parse::match_string(si, s.end(), bs)); ASSERT_EQ(bs, "");
s = "\"\\u0000\""; si = s.begin(); ASSERT_TRUE(epee::misc_utils::parse::match_string(si, s.end(), bs)); ASSERT_EQ(bs, std::string(1, '\0'));
s = "\"\\u0020\""; si = s.begin(); ASSERT_TRUE(epee::misc_utils::parse::match_string(si, s.end(), bs)); ASSERT_EQ(bs, " ");
s = "\"\\u1\""; si = s.begin(); ASSERT_FALSE(epee::misc_utils::parse::match_string(si, s.end(), bs));
s = "\"\\u12\""; si = s.begin(); ASSERT_FALSE(epee::misc_utils::parse::match_string(si, s.end(), bs));
s = "\"\\u123\""; si = s.begin(); ASSERT_FALSE(epee::misc_utils::parse::match_string(si, s.end(), bs));
s = "\"\\u1234\""; si = s.begin(); ASSERT_TRUE(epee::misc_utils::parse::match_string(si, s.end(), bs)); ASSERT_EQ(bs, "");
s = "\"foo\\u1234bar\""; si = s.begin(); ASSERT_TRUE(epee::misc_utils::parse::match_string(si, s.end(), bs)); ASSERT_EQ(bs, "fooሴbar");
s = "\"\\u3042\\u307e\\u3084\\u304b\\u3059\""; si = s.begin(); ASSERT_TRUE(epee::misc_utils::parse::match_string(si, s.end(), bs)); ASSERT_EQ(bs, "あまやかす");
}