// parser.hpp
#pragma once

#include <cassert> // assert
#include <cmath> // isfinite
#include <cstddef> // size_t
#include <cstdint> // uint8_t
#include <functional> // function
#include <string> // string
#include <utility> // move
#include <vector> // vector

#include <nlohmann/detail/exceptions.hpp>
#include <nlohmann/detail/input/input_adapters.hpp>
#include <nlohmann/detail/input/json_sax.hpp>
#include <nlohmann/detail/input/lexer.hpp>
#include <nlohmann/detail/macro_scope.hpp>
#include <nlohmann/detail/meta/is_sax.hpp>
#include <nlohmann/detail/value_t.hpp>
  16. namespace nlohmann
  17. {
  18. namespace detail
  19. {
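////////////
// parser //
////////////

/*!
@brief syntax analysis

This class implements an iterative parser: nesting of arrays and objects is
tracked on an explicit stack instead of through recursion, and every parse
event is reported to a SAX-style consumer.
*/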
  27. template<typename BasicJsonType>
  28. class parser
  29. {
  // number and string types taken from the JSON type this parser is instantiated for
  using number_integer_t = typename BasicJsonType::number_integer_t;
  using number_unsigned_t = typename BasicJsonType::number_unsigned_t;
  using number_float_t = typename BasicJsonType::number_float_t;
  using string_t = typename BasicJsonType::string_t;
  /// lexer type that supplies the tokens consumed by this parser
  using lexer_t = lexer<BasicJsonType>;
  /// token kinds produced by the lexer
  using token_type = typename lexer_t::token_type;
  36. public:
  /// events reported to a parser callback; stored in a single byte
  enum class parse_event_t : std::uint8_t
  {
      /// the parser read `{` and started to process a JSON object
      object_start,
      /// the parser read `}` and finished processing a JSON object
      object_end,
      /// the parser read `[` and started to process a JSON array
      array_start,
      /// the parser read `]` and finished processing a JSON array
      array_end,
      /// the parser read a key of a value in an object
      key,
      /// the parser finished reading a JSON value
      value
  };
  /// type of the user-supplied parser callback; its parameters are named
  /// depth, event and parsed, and it returns a bool
  /// (NOTE(review): the meaning of the returned bool is defined by
  /// json_sax_dom_callback_parser, which is not visible here -- confirm)
  using parser_callback_t =
      std::function<bool(int depth, parse_event_t event, BasicJsonType& parsed)>;
  54. /// a parser reading from an input adapter
  55. explicit parser(detail::input_adapter_t&& adapter,
  56. const parser_callback_t cb = nullptr,
  57. const bool allow_exceptions_ = true)
  58. : callback(cb), m_lexer(std::move(adapter)), allow_exceptions(allow_exceptions_)
  59. {
  60. // read first token
  61. get_token();
  62. }
  63. /*!
  64. @brief public parser interface
  65. @param[in] strict whether to expect the last token to be EOF
  66. @param[in,out] result parsed JSON value
  67. @throw parse_error.101 in case of an unexpected token
  68. @throw parse_error.102 if to_unicode fails or surrogate error
  69. @throw parse_error.103 if to_unicode fails
  70. */
  71. void parse(const bool strict, BasicJsonType& result)
  72. {
  73. if (callback)
  74. {
  75. json_sax_dom_callback_parser<BasicJsonType> sdp(result, callback, allow_exceptions);
  76. sax_parse_internal(&sdp);
  77. result.assert_invariant();
  78. // in strict mode, input must be completely read
  79. if (strict and (get_token() != token_type::end_of_input))
  80. {
  81. sdp.parse_error(m_lexer.get_position(),
  82. m_lexer.get_token_string(),
  83. parse_error::create(101, m_lexer.get_position(),
  84. exception_message(token_type::end_of_input, "value")));
  85. }
  86. // in case of an error, return discarded value
  87. if (sdp.is_errored())
  88. {
  89. result = value_t::discarded;
  90. return;
  91. }
  92. // set top-level value to null if it was discarded by the callback
  93. // function
  94. if (result.is_discarded())
  95. {
  96. result = nullptr;
  97. }
  98. }
  99. else
  100. {
  101. json_sax_dom_parser<BasicJsonType> sdp(result, allow_exceptions);
  102. sax_parse_internal(&sdp);
  103. result.assert_invariant();
  104. // in strict mode, input must be completely read
  105. if (strict and (get_token() != token_type::end_of_input))
  106. {
  107. sdp.parse_error(m_lexer.get_position(),
  108. m_lexer.get_token_string(),
  109. parse_error::create(101, m_lexer.get_position(),
  110. exception_message(token_type::end_of_input, "value")));
  111. }
  112. // in case of an error, return discarded value
  113. if (sdp.is_errored())
  114. {
  115. result = value_t::discarded;
  116. return;
  117. }
  118. }
  119. }
  120. /*!
  121. @brief public accept interface
  122. @param[in] strict whether to expect the last token to be EOF
  123. @return whether the input is a proper JSON text
  124. */
  125. bool accept(const bool strict = true)
  126. {
  127. json_sax_acceptor<BasicJsonType> sax_acceptor;
  128. return sax_parse(&sax_acceptor, strict);
  129. }
  130. template <typename SAX>
  131. JSON_HEDLEY_NON_NULL(2)
  132. bool sax_parse(SAX* sax, const bool strict = true)
  133. {
  134. (void)detail::is_sax_static_asserts<SAX, BasicJsonType> {};
  135. const bool result = sax_parse_internal(sax);
  136. // strict mode: next byte must be EOF
  137. if (result and strict and (get_token() != token_type::end_of_input))
  138. {
  139. return sax->parse_error(m_lexer.get_position(),
  140. m_lexer.get_token_string(),
  141. parse_error::create(101, m_lexer.get_position(),
  142. exception_message(token_type::end_of_input, "value")));
  143. }
  144. return result;
  145. }
  146. private:
  147. template <typename SAX>
  148. JSON_HEDLEY_NON_NULL(2)
  149. bool sax_parse_internal(SAX* sax)
  150. {
  151. // stack to remember the hierarchy of structured values we are parsing
  152. // true = array; false = object
  153. std::vector<bool> states;
  154. // value to avoid a goto (see comment where set to true)
  155. bool skip_to_state_evaluation = false;
  156. while (true)
  157. {
  158. if (not skip_to_state_evaluation)
  159. {
  160. // invariant: get_token() was called before each iteration
  161. switch (last_token)
  162. {
  163. case token_type::begin_object:
  164. {
  165. if (JSON_HEDLEY_UNLIKELY(not sax->start_object(std::size_t(-1))))
  166. {
  167. return false;
  168. }
  169. // closing } -> we are done
  170. if (get_token() == token_type::end_object)
  171. {
  172. if (JSON_HEDLEY_UNLIKELY(not sax->end_object()))
  173. {
  174. return false;
  175. }
  176. break;
  177. }
  178. // parse key
  179. if (JSON_HEDLEY_UNLIKELY(last_token != token_type::value_string))
  180. {
  181. return sax->parse_error(m_lexer.get_position(),
  182. m_lexer.get_token_string(),
  183. parse_error::create(101, m_lexer.get_position(),
  184. exception_message(token_type::value_string, "object key")));
  185. }
  186. if (JSON_HEDLEY_UNLIKELY(not sax->key(m_lexer.get_string())))
  187. {
  188. return false;
  189. }
  190. // parse separator (:)
  191. if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::name_separator))
  192. {
  193. return sax->parse_error(m_lexer.get_position(),
  194. m_lexer.get_token_string(),
  195. parse_error::create(101, m_lexer.get_position(),
  196. exception_message(token_type::name_separator, "object separator")));
  197. }
  198. // remember we are now inside an object
  199. states.push_back(false);
  200. // parse values
  201. get_token();
  202. continue;
  203. }
  204. case token_type::begin_array:
  205. {
  206. if (JSON_HEDLEY_UNLIKELY(not sax->start_array(std::size_t(-1))))
  207. {
  208. return false;
  209. }
  210. // closing ] -> we are done
  211. if (get_token() == token_type::end_array)
  212. {
  213. if (JSON_HEDLEY_UNLIKELY(not sax->end_array()))
  214. {
  215. return false;
  216. }
  217. break;
  218. }
  219. // remember we are now inside an array
  220. states.push_back(true);
  221. // parse values (no need to call get_token)
  222. continue;
  223. }
  224. case token_type::value_float:
  225. {
  226. const auto res = m_lexer.get_number_float();
  227. if (JSON_HEDLEY_UNLIKELY(not std::isfinite(res)))
  228. {
  229. return sax->parse_error(m_lexer.get_position(),
  230. m_lexer.get_token_string(),
  231. out_of_range::create(406, "number overflow parsing '" + m_lexer.get_token_string() + "'"));
  232. }
  233. if (JSON_HEDLEY_UNLIKELY(not sax->number_float(res, m_lexer.get_string())))
  234. {
  235. return false;
  236. }
  237. break;
  238. }
  239. case token_type::literal_false:
  240. {
  241. if (JSON_HEDLEY_UNLIKELY(not sax->boolean(false)))
  242. {
  243. return false;
  244. }
  245. break;
  246. }
  247. case token_type::literal_null:
  248. {
  249. if (JSON_HEDLEY_UNLIKELY(not sax->null()))
  250. {
  251. return false;
  252. }
  253. break;
  254. }
  255. case token_type::literal_true:
  256. {
  257. if (JSON_HEDLEY_UNLIKELY(not sax->boolean(true)))
  258. {
  259. return false;
  260. }
  261. break;
  262. }
  263. case token_type::value_integer:
  264. {
  265. if (JSON_HEDLEY_UNLIKELY(not sax->number_integer(m_lexer.get_number_integer())))
  266. {
  267. return false;
  268. }
  269. break;
  270. }
  271. case token_type::value_string:
  272. {
  273. if (JSON_HEDLEY_UNLIKELY(not sax->string(m_lexer.get_string())))
  274. {
  275. return false;
  276. }
  277. break;
  278. }
  279. case token_type::value_unsigned:
  280. {
  281. if (JSON_HEDLEY_UNLIKELY(not sax->number_unsigned(m_lexer.get_number_unsigned())))
  282. {
  283. return false;
  284. }
  285. break;
  286. }
  287. case token_type::parse_error:
  288. {
  289. // using "uninitialized" to avoid "expected" message
  290. return sax->parse_error(m_lexer.get_position(),
  291. m_lexer.get_token_string(),
  292. parse_error::create(101, m_lexer.get_position(),
  293. exception_message(token_type::uninitialized, "value")));
  294. }
  295. default: // the last token was unexpected
  296. {
  297. return sax->parse_error(m_lexer.get_position(),
  298. m_lexer.get_token_string(),
  299. parse_error::create(101, m_lexer.get_position(),
  300. exception_message(token_type::literal_or_value, "value")));
  301. }
  302. }
  303. }
  304. else
  305. {
  306. skip_to_state_evaluation = false;
  307. }
  308. // we reached this line after we successfully parsed a value
  309. if (states.empty())
  310. {
  311. // empty stack: we reached the end of the hierarchy: done
  312. return true;
  313. }
  314. if (states.back()) // array
  315. {
  316. // comma -> next value
  317. if (get_token() == token_type::value_separator)
  318. {
  319. // parse a new value
  320. get_token();
  321. continue;
  322. }
  323. // closing ]
  324. if (JSON_HEDLEY_LIKELY(last_token == token_type::end_array))
  325. {
  326. if (JSON_HEDLEY_UNLIKELY(not sax->end_array()))
  327. {
  328. return false;
  329. }
  330. // We are done with this array. Before we can parse a
  331. // new value, we need to evaluate the new state first.
  332. // By setting skip_to_state_evaluation to false, we
  333. // are effectively jumping to the beginning of this if.
  334. assert(not states.empty());
  335. states.pop_back();
  336. skip_to_state_evaluation = true;
  337. continue;
  338. }
  339. return sax->parse_error(m_lexer.get_position(),
  340. m_lexer.get_token_string(),
  341. parse_error::create(101, m_lexer.get_position(),
  342. exception_message(token_type::end_array, "array")));
  343. }
  344. else // object
  345. {
  346. // comma -> next value
  347. if (get_token() == token_type::value_separator)
  348. {
  349. // parse key
  350. if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::value_string))
  351. {
  352. return sax->parse_error(m_lexer.get_position(),
  353. m_lexer.get_token_string(),
  354. parse_error::create(101, m_lexer.get_position(),
  355. exception_message(token_type::value_string, "object key")));
  356. }
  357. if (JSON_HEDLEY_UNLIKELY(not sax->key(m_lexer.get_string())))
  358. {
  359. return false;
  360. }
  361. // parse separator (:)
  362. if (JSON_HEDLEY_UNLIKELY(get_token() != token_type::name_separator))
  363. {
  364. return sax->parse_error(m_lexer.get_position(),
  365. m_lexer.get_token_string(),
  366. parse_error::create(101, m_lexer.get_position(),
  367. exception_message(token_type::name_separator, "object separator")));
  368. }
  369. // parse values
  370. get_token();
  371. continue;
  372. }
  373. // closing }
  374. if (JSON_HEDLEY_LIKELY(last_token == token_type::end_object))
  375. {
  376. if (JSON_HEDLEY_UNLIKELY(not sax->end_object()))
  377. {
  378. return false;
  379. }
  380. // We are done with this object. Before we can parse a
  381. // new value, we need to evaluate the new state first.
  382. // By setting skip_to_state_evaluation to false, we
  383. // are effectively jumping to the beginning of this if.
  384. assert(not states.empty());
  385. states.pop_back();
  386. skip_to_state_evaluation = true;
  387. continue;
  388. }
  389. return sax->parse_error(m_lexer.get_position(),
  390. m_lexer.get_token_string(),
  391. parse_error::create(101, m_lexer.get_position(),
  392. exception_message(token_type::end_object, "object")));
  393. }
  394. }
  395. }
  396. /// get next token from lexer
  397. token_type get_token()
  398. {
  399. return last_token = m_lexer.scan();
  400. }
  401. std::string exception_message(const token_type expected, const std::string& context)
  402. {
  403. std::string error_msg = "syntax error ";
  404. if (not context.empty())
  405. {
  406. error_msg += "while parsing " + context + " ";
  407. }
  408. error_msg += "- ";
  409. if (last_token == token_type::parse_error)
  410. {
  411. error_msg += std::string(m_lexer.get_error_message()) + "; last read: '" +
  412. m_lexer.get_token_string() + "'";
  413. }
  414. else
  415. {
  416. error_msg += "unexpected " + std::string(lexer_t::token_type_name(last_token));
  417. }
  418. if (expected != token_type::uninitialized)
  419. {
  420. error_msg += "; expected " + std::string(lexer_t::token_type_name(expected));
  421. }
  422. return error_msg;
  423. }
  private:
  /// callback function; when empty, parse() builds the value with
  /// json_sax_dom_parser instead of json_sax_dom_callback_parser
  const parser_callback_t callback = nullptr;
  /// the type of the last read token (updated by every get_token() call)
  token_type last_token = token_type::uninitialized;
  /// the lexer
  lexer_t m_lexer;
  /// whether to throw exceptions in case of errors; passed on to the SAX
  /// DOM builders created in parse()
  const bool allow_exceptions = true;
  433. };
  434. } // namespace detail
  435. } // namespace nlohmann