input_adapters.hpp 16 KB


  1. #pragma once
  2. #include <array> // array
  3. #include <cassert> // assert
  4. #include <cstddef> // size_t
  5. #include <cstdio> //FILE *
  6. #include <cstring> // strlen
  7. #include <istream> // istream
  8. #include <iterator> // begin, end, iterator_traits, random_access_iterator_tag, distance, next
  9. #include <memory> // shared_ptr, make_shared, addressof
  10. #include <numeric> // accumulate
  11. #include <string> // string, char_traits
  12. #include <type_traits> // enable_if, is_base_of, is_pointer, is_integral, remove_pointer
  13. #include <utility> // pair, declval
  14. #include <nlohmann/detail/iterators/iterator_traits.hpp>
  15. #include <nlohmann/detail/macro_scope.hpp>
  16. namespace nlohmann
  17. {
  18. namespace detail
  19. {
  /// the input formats the library knows how to read
  enum class input_format_t
  {
      json,
      cbor,
      msgpack,
      ubjson,
      bson
  };
  22. ////////////////////
  23. // input adapters //
  24. ////////////////////
  /*!
  @brief abstract input adapter interface

  An adapter delivers its input one character at a time as
  std::char_traits<char>::int_type values. Every valid char is reported as a
  non-negative value (typically the range of unsigned char); the end of the
  input is reported as std::char_traits<char>::eof(), which lies outside that
  range (typically -1, but any value that is not a valid char is possible).
  */
  struct input_adapter_protocol
  {
      /// polymorphic base: adapters are destroyed through this interface
      virtual ~input_adapter_protocol() = default;

      /// @return the next character in [0,255], or std::char_traits<char>::eof()
      virtual std::char_traits<char>::int_type get_character() = 0;
  };

  /// shared handle used to pass adapters around
  using input_adapter_t = std::shared_ptr<input_adapter_protocol>;
  43. /*!
  44. Input adapter for stdio file access. This adapter read only 1 byte and do not use any
  45. buffer. This adapter is a very low level adapter.
  46. */
  47. class file_input_adapter : public input_adapter_protocol
  48. {
  49. public:
  50. JSON_HEDLEY_NON_NULL(2)
  51. explicit file_input_adapter(std::FILE* f) noexcept
  52. : m_file(f)
  53. {}
  54. // make class move-only
  55. file_input_adapter(const file_input_adapter&) = delete;
  56. file_input_adapter(file_input_adapter&&) = default;
  57. file_input_adapter& operator=(const file_input_adapter&) = delete;
  58. file_input_adapter& operator=(file_input_adapter&&) = default;
  59. ~file_input_adapter() override = default;
  60. std::char_traits<char>::int_type get_character() noexcept override
  61. {
  62. return std::fgetc(m_file);
  63. }
  64. private:
  65. /// the file pointer to read from
  66. std::FILE* m_file;
  67. };
  68. /*!
  69. Input adapter for a (caching) istream. Ignores a UFT Byte Order Mark at
  70. beginning of input. Does not support changing the underlying std::streambuf
  71. in mid-input. Maintains underlying std::istream and std::streambuf to support
  72. subsequent use of standard std::istream operations to process any input
  73. characters following those used in parsing the JSON input. Clears the
  74. std::istream flags; any input errors (e.g., EOF) will be detected by the first
  75. subsequent call for input from the std::istream.
  76. */
  77. class input_stream_adapter : public input_adapter_protocol
  78. {
  79. public:
  80. ~input_stream_adapter() override
  81. {
  82. // clear stream flags; we use underlying streambuf I/O, do not
  83. // maintain ifstream flags, except eof
  84. is.clear(is.rdstate() & std::ios::eofbit);
  85. }
  86. explicit input_stream_adapter(std::istream& i)
  87. : is(i), sb(*i.rdbuf())
  88. {}
  89. // delete because of pointer members
  90. input_stream_adapter(const input_stream_adapter&) = delete;
  91. input_stream_adapter& operator=(input_stream_adapter&) = delete;
  92. input_stream_adapter(input_stream_adapter&&) = delete;
  93. input_stream_adapter& operator=(input_stream_adapter&&) = delete;
  94. // std::istream/std::streambuf use std::char_traits<char>::to_int_type, to
  95. // ensure that std::char_traits<char>::eof() and the character 0xFF do not
  96. // end up as the same value, eg. 0xFFFFFFFF.
  97. std::char_traits<char>::int_type get_character() override
  98. {
  99. auto res = sb.sbumpc();
  100. // set eof manually, as we don't use the istream interface.
  101. if (res == EOF)
  102. {
  103. is.clear(is.rdstate() | std::ios::eofbit);
  104. }
  105. return res;
  106. }
  107. private:
  108. /// the associated input stream
  109. std::istream& is;
  110. std::streambuf& sb;
  111. };
  112. /// input adapter for buffer input
  113. class input_buffer_adapter : public input_adapter_protocol
  114. {
  115. public:
  116. input_buffer_adapter(const char* b, const std::size_t l) noexcept
  117. : cursor(b), limit(b == nullptr ? nullptr : (b + l))
  118. {}
  119. // delete because of pointer members
  120. input_buffer_adapter(const input_buffer_adapter&) = delete;
  121. input_buffer_adapter& operator=(input_buffer_adapter&) = delete;
  122. input_buffer_adapter(input_buffer_adapter&&) = delete;
  123. input_buffer_adapter& operator=(input_buffer_adapter&&) = delete;
  124. ~input_buffer_adapter() override = default;
  125. std::char_traits<char>::int_type get_character() noexcept override
  126. {
  127. if (JSON_HEDLEY_LIKELY(cursor < limit))
  128. {
  129. assert(cursor != nullptr and limit != nullptr);
  130. return std::char_traits<char>::to_int_type(*(cursor++));
  131. }
  132. return std::char_traits<char>::eof();
  133. }
  134. private:
  135. /// pointer to the current character
  136. const char* cursor;
  137. /// pointer past the last character
  138. const char* const limit;
  139. };
  /*!
  Converts the next code point of a wide string into UTF-8 bytes.

  The primary template treats every element of the string as one UTF-32 code
  unit; the specialization for T == 2 below treats the string as UTF-16.

  @tparam WideStringType  string type providing size() and operator[]
  @tparam T               size in bytes of one string element

  fill_buffer() parameters (shared by both variants):
  @param[in]     str                string to read from
  @param[in,out] current_wchar      index of the next element of @a str to
                                    read; advanced by the number of elements
                                    consumed
  @param[out]    utf8_bytes         receives the produced values
  @param[out]    utf8_bytes_index   always reset to 0
  @param[out]    utf8_bytes_filled  number of valid entries in @a utf8_bytes
                                    (always at least 1)

  When the string is exhausted, a single std::char_traits<char>::eof() entry
  is produced.
  */
  template<typename WideStringType, std::size_t T>
  struct wide_string_input_helper
  {
      // UTF-32
      static void fill_buffer(const WideStringType& str,
                              std::size_t& current_wchar,
                              std::array<std::char_traits<char>::int_type, 4>& utf8_bytes,
                              std::size_t& utf8_bytes_index,
                              std::size_t& utf8_bytes_filled)
      {
          using int_type = std::char_traits<char>::int_type;

          utf8_bytes_index = 0;

          // ">=" rather than "==": never index past the end, even if the
          // caller hands in an index that is already out of range
          if (current_wchar >= str.size())
          {
              utf8_bytes[0] = std::char_traits<char>::eof();
              utf8_bytes_filled = 1;
              return;
          }

          // get the current character
          const auto wc = static_cast<unsigned int>(str[current_wchar++]);

          // UTF-32 to UTF-8 encoding
          if (wc < 0x80)
          {
              utf8_bytes[0] = static_cast<int_type>(wc);
              utf8_bytes_filled = 1;
          }
          else if (wc <= 0x7FF)
          {
              utf8_bytes[0] = static_cast<int_type>(0xC0u | ((wc >> 6u) & 0x1Fu));
              utf8_bytes[1] = static_cast<int_type>(0x80u | (wc & 0x3Fu));
              utf8_bytes_filled = 2;
          }
          else if (wc <= 0xFFFF)
          {
              utf8_bytes[0] = static_cast<int_type>(0xE0u | ((wc >> 12u) & 0x0Fu));
              utf8_bytes[1] = static_cast<int_type>(0x80u | ((wc >> 6u) & 0x3Fu));
              utf8_bytes[2] = static_cast<int_type>(0x80u | (wc & 0x3Fu));
              utf8_bytes_filled = 3;
          }
          else if (wc <= 0x10FFFF)
          {
              utf8_bytes[0] = static_cast<int_type>(0xF0u | ((wc >> 18u) & 0x07u));
              utf8_bytes[1] = static_cast<int_type>(0x80u | ((wc >> 12u) & 0x3Fu));
              utf8_bytes[2] = static_cast<int_type>(0x80u | ((wc >> 6u) & 0x3Fu));
              utf8_bytes[3] = static_cast<int_type>(0x80u | (wc & 0x3Fu));
              utf8_bytes_filled = 4;
          }
          else
          {
              // unknown character: passed through as one (out-of-range) value
              // NOTE(review): for values that do not fit int_type the cast
              // could produce eof() -- confirm whether that matters to callers
              utf8_bytes[0] = static_cast<int_type>(wc);
              utf8_bytes_filled = 1;
          }
      }
  };

  /*!
  UTF-16 variant of wide_string_input_helper (string elements are 2 bytes).

  Same contract as the primary template, except that an element in the
  surrogate range [0xD800, 0xE000) is combined with the element that follows
  it into one code point (two elements are consumed). A surrogate that is the
  last element of the string is passed through as a single value, which is
  larger than 0xFF and therefore not a valid byte.
  */
  template<typename WideStringType>
  struct wide_string_input_helper<WideStringType, 2>
  {
      // UTF-16
      static void fill_buffer(const WideStringType& str,
                              std::size_t& current_wchar,
                              std::array<std::char_traits<char>::int_type, 4>& utf8_bytes,
                              std::size_t& utf8_bytes_index,
                              std::size_t& utf8_bytes_filled)
      {
          using int_type = std::char_traits<char>::int_type;

          utf8_bytes_index = 0;

          // ">=" rather than "==": never index past the end, even if the
          // caller hands in an index that is already out of range
          if (current_wchar >= str.size())
          {
              utf8_bytes[0] = std::char_traits<char>::eof();
              utf8_bytes_filled = 1;
              return;
          }

          // get the current character
          const auto wc = static_cast<unsigned int>(str[current_wchar++]);

          // UTF-16 to UTF-8 encoding
          if (wc < 0x80)
          {
              utf8_bytes[0] = static_cast<int_type>(wc);
              utf8_bytes_filled = 1;
          }
          else if (wc <= 0x7FF)
          {
              utf8_bytes[0] = static_cast<int_type>(0xC0u | ((wc >> 6u)));
              utf8_bytes[1] = static_cast<int_type>(0x80u | (wc & 0x3Fu));
              utf8_bytes_filled = 2;
          }
          else if (0xD800 > wc or wc >= 0xE000)
          {
              utf8_bytes[0] = static_cast<int_type>(0xE0u | ((wc >> 12u)));
              utf8_bytes[1] = static_cast<int_type>(0x80u | ((wc >> 6u) & 0x3Fu));
              utf8_bytes[2] = static_cast<int_type>(0x80u | (wc & 0x3Fu));
              utf8_bytes_filled = 3;
          }
          else if (current_wchar < str.size())
          {
              // surrogate with a following element: combine both
              const auto wc2 = static_cast<unsigned int>(str[current_wchar++]);
              const auto charcode = 0x10000u + (((wc & 0x3FFu) << 10u) | (wc2 & 0x3FFu));
              utf8_bytes[0] = static_cast<int_type>(0xF0u | (charcode >> 18u));
              utf8_bytes[1] = static_cast<int_type>(0x80u | ((charcode >> 12u) & 0x3Fu));
              utf8_bytes[2] = static_cast<int_type>(0x80u | ((charcode >> 6u) & 0x3Fu));
              utf8_bytes[3] = static_cast<int_type>(0x80u | (charcode & 0x3Fu));
              utf8_bytes_filled = 4;
          }
          else
          {
              // unknown character: a surrogate at the very end of the string.
              // current_wchar already equals str.size() here and must NOT be
              // advanced again, otherwise the end-of-input check above is
              // skipped on the next call and the string is read out of bounds.
              utf8_bytes[0] = static_cast<int_type>(wc);
              utf8_bytes_filled = 1;
          }
      }
  };
  258. template<typename WideStringType>
  259. class wide_string_input_adapter : public input_adapter_protocol
  260. {
  261. public:
  262. explicit wide_string_input_adapter(const WideStringType& w) noexcept
  263. : str(w)
  264. {}
  265. std::char_traits<char>::int_type get_character() noexcept override
  266. {
  267. // check if buffer needs to be filled
  268. if (utf8_bytes_index == utf8_bytes_filled)
  269. {
  270. fill_buffer<sizeof(typename WideStringType::value_type)>();
  271. assert(utf8_bytes_filled > 0);
  272. assert(utf8_bytes_index == 0);
  273. }
  274. // use buffer
  275. assert(utf8_bytes_filled > 0);
  276. assert(utf8_bytes_index < utf8_bytes_filled);
  277. return utf8_bytes[utf8_bytes_index++];
  278. }
  279. private:
  280. template<size_t T>
  281. void fill_buffer()
  282. {
  283. wide_string_input_helper<WideStringType, T>::fill_buffer(str, current_wchar, utf8_bytes, utf8_bytes_index, utf8_bytes_filled);
  284. }
  285. /// the wstring to process
  286. const WideStringType& str;
  287. /// index of the current wchar in str
  288. std::size_t current_wchar = 0;
  289. /// a buffer for UTF-8 bytes
  290. std::array<std::char_traits<char>::int_type, 4> utf8_bytes = {{0, 0, 0, 0}};
  291. /// index to the utf8_codes array for the next valid byte
  292. std::size_t utf8_bytes_index = 0;
  293. /// number of valid bytes in the utf8_codes array
  294. std::size_t utf8_bytes_filled = 0;
  295. };
  296. class input_adapter
  297. {
  298. public:
  299. // native support
  300. JSON_HEDLEY_NON_NULL(2)
  301. input_adapter(std::FILE* file)
  302. : ia(std::make_shared<file_input_adapter>(file)) {}
  303. /// input adapter for input stream
  304. input_adapter(std::istream& i)
  305. : ia(std::make_shared<input_stream_adapter>(i)) {}
  306. /// input adapter for input stream
  307. input_adapter(std::istream&& i)
  308. : ia(std::make_shared<input_stream_adapter>(i)) {}
  309. input_adapter(const std::wstring& ws)
  310. : ia(std::make_shared<wide_string_input_adapter<std::wstring>>(ws)) {}
  311. input_adapter(const std::u16string& ws)
  312. : ia(std::make_shared<wide_string_input_adapter<std::u16string>>(ws)) {}
  313. input_adapter(const std::u32string& ws)
  314. : ia(std::make_shared<wide_string_input_adapter<std::u32string>>(ws)) {}
  315. /// input adapter for buffer
  316. template<typename CharT,
  317. typename std::enable_if<
  318. std::is_pointer<CharT>::value and
  319. std::is_integral<typename std::remove_pointer<CharT>::type>::value and
  320. sizeof(typename std::remove_pointer<CharT>::type) == 1,
  321. int>::type = 0>
  322. input_adapter(CharT b, std::size_t l)
  323. : ia(std::make_shared<input_buffer_adapter>(reinterpret_cast<const char*>(b), l)) {}
  324. // derived support
  325. /// input adapter for string literal
  326. template<typename CharT,
  327. typename std::enable_if<
  328. std::is_pointer<CharT>::value and
  329. std::is_integral<typename std::remove_pointer<CharT>::type>::value and
  330. sizeof(typename std::remove_pointer<CharT>::type) == 1,
  331. int>::type = 0>
  332. input_adapter(CharT b)
  333. : input_adapter(reinterpret_cast<const char*>(b),
  334. std::strlen(reinterpret_cast<const char*>(b))) {}
  335. /// input adapter for iterator range with contiguous storage
  336. template<class IteratorType,
  337. typename std::enable_if<
  338. std::is_same<typename iterator_traits<IteratorType>::iterator_category, std::random_access_iterator_tag>::value,
  339. int>::type = 0>
  340. input_adapter(IteratorType first, IteratorType last)
  341. {
  342. #ifndef NDEBUG
  343. // assertion to check that the iterator range is indeed contiguous,
  344. // see https://stackoverflow.com/a/35008842/266378 for more discussion
  345. const auto is_contiguous = std::accumulate(
  346. first, last, std::pair<bool, int>(true, 0),
  347. [&first](std::pair<bool, int> res, decltype(*first) val)
  348. {
  349. res.first &= (val == *(std::next(std::addressof(*first), res.second++)));
  350. return res;
  351. }).first;
  352. assert(is_contiguous);
  353. #endif
  354. // assertion to check that each element is 1 byte long
  355. static_assert(
  356. sizeof(typename iterator_traits<IteratorType>::value_type) == 1,
  357. "each element in the iterator range must have the size of 1 byte");
  358. const auto len = static_cast<size_t>(std::distance(first, last));
  359. if (JSON_HEDLEY_LIKELY(len > 0))
  360. {
  361. // there is at least one element: use the address of first
  362. ia = std::make_shared<input_buffer_adapter>(reinterpret_cast<const char*>(&(*first)), len);
  363. }
  364. else
  365. {
  366. // the address of first cannot be used: use nullptr
  367. ia = std::make_shared<input_buffer_adapter>(nullptr, len);
  368. }
  369. }
  370. /// input adapter for array
  371. template<class T, std::size_t N>
  372. input_adapter(T (&array)[N])
  373. : input_adapter(std::begin(array), std::end(array)) {}
  374. /// input adapter for contiguous container
  375. template<class ContiguousContainer, typename
  376. std::enable_if<not std::is_pointer<ContiguousContainer>::value and
  377. std::is_base_of<std::random_access_iterator_tag, typename iterator_traits<decltype(std::begin(std::declval<ContiguousContainer const>()))>::iterator_category>::value,
  378. int>::type = 0>
  379. input_adapter(const ContiguousContainer& c)
  380. : input_adapter(std::begin(c), std::end(c)) {}
  381. operator input_adapter_t()
  382. {
  383. return ia;
  384. }
  385. private:
  386. /// the actual adapter
  387. input_adapter_t ia = nullptr;
  388. };
  389. } // namespace detail
  390. } // namespace nlohmann