Orcus
Loading...
Searching...
No Matches
sax_parser.hpp
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2/*
3 * This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 */
7
8#include <cstdint>
9
10#ifndef INCLUDED_ORCUS_SAX_PARSER_HPP
11#define INCLUDED_ORCUS_SAX_PARSER_HPP
12
13#include "sax_parser_base.hpp"
14
15#include <string_view>
16#include <cstdint>
17
18namespace orcus {
19
/**
 * Default configuration for sax_parser.  Used as the ConfigT template
 * argument when the caller does not supply one.
 */
struct sax_parser_default_config
{
    /**
     * Baseline XML version the parser assumes, encoded as an integer: 10
     * stands for XML 1.0 and 11 for XML 1.1.  The parser insists on a
     * leading <?xml ... ?> declaration only when this value is 11 or
     * greater.
     */
    static constexpr uint8_t baseline_version = 10;
};
29
31{
32public:
39 {
40 (void)dtd;
41 }
42
50 void start_declaration(std::string_view decl)
51 {
52 (void)decl;
53 }
54
60 void end_declaration(std::string_view decl)
61 {
62 (void)decl;
63 }
64
71 {
72 (void)elem;
73 }
74
81 {
82 (void)elem;
83 }
84
99 void characters(std::string_view val, bool transient)
100 {
101 (void)val; (void)transient;
102 }
103
113 {
114 (void)attr;
115 }
116};
117
133template<typename HandlerT, typename ConfigT = sax_parser_default_config>
135{
136public:
137 typedef HandlerT handler_type;
138 typedef ConfigT config_type;
139
140 sax_parser(std::string_view content, handler_type& handler);
141 ~sax_parser() = default;
142
143 void parse();
144
145private:
146
151 void header();
152 void body();
153 void element();
154 void element_open(std::ptrdiff_t begin_pos);
155 void element_close(std::ptrdiff_t begin_pos);
156 void special_tag();
157 void declaration(const char* name_check);
158 void cdata();
159 void doctype();
160 void characters();
161 void attribute();
162
163private:
164 handler_type& m_handler;
165};
166
167template<typename HandlerT, typename ConfigT>
168sax_parser<HandlerT,ConfigT>::sax_parser(std::string_view content, handler_type& handler) :
169 sax::parser_base(content.data(), content.size()),
170 m_handler(handler)
171{
172}
173
174template<typename HandlerT, typename ConfigT>
175void sax_parser<HandlerT,ConfigT>::parse()
176{
177 m_nest_level = 0;
178 mp_char = mp_begin;
179 header();
180 skip_space_and_control();
181 body();
182
183 assert(m_buffer_pos == 0);
184}
185
186template<typename HandlerT, typename ConfigT>
187void sax_parser<HandlerT,ConfigT>::header()
188{
189 // we don't handle multi byte encodings so we can just skip bom entry if exists.
190 skip_bom();
191
192 // Allow leading whitespace in the XML stream.
193 // TODO : Make this configurable since strictly speaking such an XML
194 // sttream is invalid.
195 skip_space_and_control();
196
197 if (!has_char() || cur_char() != '<')
198 throw malformed_xml_error("xml file must begin with '<'.", offset());
199
200 if (config_type::baseline_version >= 11)
201 {
202 // XML version 1.1 requires a header declaration whereas in 1.0 it's
203 // optional.
204 if (next_char_checked() != '?')
205 throw malformed_xml_error("xml file must begin with '<?'.", offset());
206
207 declaration("xml");
208 }
209}
210
211template<typename HandlerT, typename ConfigT>
212void sax_parser<HandlerT,ConfigT>::body()
213{
214 while (has_char())
215 {
216 if (cur_char() == '<')
217 {
218 element();
219 if (!m_root_elem_open)
220 // Root element closed. Stop parsing.
221 return;
222 }
223 else if (m_nest_level)
224 // Call characters only when in xml hierarchy.
225 characters();
226 else
227 next();
228 }
229}
230
231template<typename HandlerT, typename ConfigT>
232void sax_parser<HandlerT,ConfigT>::element()
233{
234 assert(cur_char() == '<');
235 std::ptrdiff_t pos = offset();
236 char c = next_char_checked();
237 switch (c)
238 {
239 case '/':
240 element_close(pos);
241 return;
242 case '!':
243 special_tag();
244 return;
245 case '?':
246 declaration(nullptr);
247 return;
248 }
249
250 element_open(pos);
251}
252
253template<typename HandlerT, typename ConfigT>
254void sax_parser<HandlerT,ConfigT>::element_open(std::ptrdiff_t begin_pos)
255{
256 sax::parser_element elem;
257 element_name(elem, begin_pos);
258
259 while (true)
260 {
261 skip_space_and_control();
262 char c = cur_char_checked();
263 if (c == '/')
264 {
265 // Self-closing element: <element/>
266 if (next_and_char() != '>')
267 throw malformed_xml_error("expected '/>' to self-close the element.", offset());
268 next();
269 elem.end_pos = offset();
270 m_handler.start_element(elem);
271 reset_buffer_pos();
272 m_handler.end_element(elem);
273 if (!m_nest_level)
274 m_root_elem_open = false;
275#if ORCUS_DEBUG_SAX_PARSER
276 cout << "element_open: ns='" << elem.ns << "', name='" << elem.name << "' (self-closing)" << endl;
277#endif
278 return;
279 }
280 else if (c == '>')
281 {
282 // End of opening element: <element>
283 next();
284 elem.end_pos = offset();
285 nest_up();
286 m_handler.start_element(elem);
287 reset_buffer_pos();
288#if ORCUS_DEBUG_SAX_PARSER
289 cout << "element_open: ns='" << elem.ns << "', name='" << elem.name << "'" << endl;
290#endif
291 return;
292 }
293 else
294 attribute();
295 }
296}
297
298template<typename HandlerT, typename ConfigT>
299void sax_parser<HandlerT,ConfigT>::element_close(std::ptrdiff_t begin_pos)
300{
301 assert(cur_char() == '/');
302 nest_down();
303 next_check();
304 sax::parser_element elem;
305 element_name(elem, begin_pos);
306
307 if (cur_char() != '>')
308 throw malformed_xml_error("expected '>' to close the element.", offset());
309 next();
310 elem.end_pos = offset();
311
312 m_handler.end_element(elem);
313#if ORCUS_DEBUG_SAX_PARSER
314 cout << "element_close: ns='" << elem.ns << "', name='" << elem.name << "'" << endl;
315#endif
316 if (!m_nest_level)
317 m_root_elem_open = false;
318}
319
320template<typename HandlerT, typename ConfigT>
321void sax_parser<HandlerT,ConfigT>::special_tag()
322{
323 assert(cur_char() == '!');
324 // This can be either <![CDATA, <!--, or <!DOCTYPE.
325 size_t len = available_size();
326 if (len < 2)
327 throw malformed_xml_error("special tag too short.", offset());
328
329 switch (next_and_char())
330 {
331 case '-':
332 {
333 // Possibly comment.
334 if (next_and_char() != '-')
335 throw malformed_xml_error("comment expected.", offset());
336
337 len -= 2;
338 if (len < 3)
339 throw malformed_xml_error("malformed comment.", offset());
340
341 next();
342 comment();
343 }
344 break;
345 case '[':
346 {
347 // Possibly a CDATA.
348 expects_next("CDATA[", 6);
349 if (has_char())
350 cdata();
351 }
352 break;
353 case 'D':
354 {
355 // check if this is a DOCTYPE.
356 expects_next("OCTYPE", 6);
357 skip_space_and_control();
358 if (has_char())
359 doctype();
360 }
361 break;
362 default:
363 throw malformed_xml_error("failed to parse special tag.", offset());
364 }
365}
366
367template<typename HandlerT, typename ConfigT>
368void sax_parser<HandlerT,ConfigT>::declaration(const char* name_check)
369{
370 assert(cur_char() == '?');
371 next_check();
372
373 // Get the declaration name first.
374 std::string_view decl_name;
375 name(decl_name);
376#if ORCUS_DEBUG_SAX_PARSER
377 cout << "sax_parser::declaration: start name='" << decl_name << "'" << endl;
378#endif
379
380 if (name_check && decl_name != name_check)
381 {
382 std::ostringstream os;
383 os << "declaration name of '" << name_check << "' was expected, but '" << decl_name << "' was found instead.";
384 throw malformed_xml_error(os.str(), offset());
385 }
386
387 m_handler.start_declaration(decl_name);
388 skip_space_and_control();
389
390 // Parse the attributes.
391 while (cur_char_checked() != '?')
392 {
393 attribute();
394 skip_space_and_control();
395 }
396 if (next_char_checked() != '>')
397 throw malformed_xml_error("declaration must end with '?>'.", offset());
398
399 m_handler.end_declaration(decl_name);
400 reset_buffer_pos();
401 next();
402#if ORCUS_DEBUG_SAX_PARSER
403 cout << "sax_parser::declaration: end name='" << decl_name << "'" << endl;
404#endif
405}
406
407template<typename HandlerT, typename ConfigT>
408void sax_parser<HandlerT,ConfigT>::cdata()
409{
410 size_t len = available_size();
411 assert(len > 3);
412
413 // Parse until we reach ']]>'.
414 const char* p0 = mp_char;
415 size_t i = 0, match = 0;
416 for (char c = cur_char(); i < len; ++i, c = next_and_char())
417 {
418 if (c == ']')
419 {
420 // Be aware that we may encounter a series of more than two ']'
421 // characters, in which case we'll only count the last two.
422
423 if (match == 0)
424 // First ']'
425 ++match;
426 else if (match == 1)
427 // Second ']'
428 ++match;
429 }
430 else if (c == '>' && match == 2)
431 {
432 // Found ']]>'.
433 size_t cdata_len = i - 2;
434 m_handler.characters(std::string_view(p0, cdata_len), false);
435 next();
436 return;
437 }
438 else
439 match = 0;
440 }
441 throw malformed_xml_error("malformed CDATA section.", offset());
442}
443
444template<typename HandlerT, typename ConfigT>
445void sax_parser<HandlerT,ConfigT>::doctype()
446{
447 // Parse the root element first.
448 sax::doctype_declaration param;
449 name(param.root_element);
450 skip_space_and_control();
451
452 // Either PUBLIC or SYSTEM.
453 size_t len = available_size();
454 if (len < 6)
455 throw malformed_xml_error("DOCTYPE section too short.", offset());
456
457 param.keyword = sax::doctype_declaration::keyword_type::dtd_private;
458 char c = cur_char();
459 if (c == 'P')
460 {
461 if (next_and_char() != 'U' || next_and_char() != 'B' || next_and_char() != 'L' || next_and_char() != 'I' || next_and_char() != 'C')
462 throw malformed_xml_error("malformed DOCTYPE section.", offset());
463
464 param.keyword = sax::doctype_declaration::keyword_type::dtd_public;
465 }
466 else if (c == 'S')
467 {
468 if (next_and_char() != 'Y' || next_and_char() != 'S' || next_and_char() != 'T' || next_and_char() != 'E' || next_and_char() != 'M')
469 throw malformed_xml_error("malformed DOCTYPE section.", offset());
470 }
471
472 next_check();
473 skip_space_and_control();
474
475 // Parse FPI.
476 value(param.fpi, false);
477
478 has_char_throw("DOCTYPE section too short.");
479 skip_space_and_control();
480 has_char_throw("DOCTYPE section too short.");
481
482 if (cur_char() == '>')
483 {
484 // Optional URI not given. Exit.
485#if ORCUS_DEBUG_SAX_PARSER
486 cout << "sax_parser::doctype: root='" << param.root_element << "', fpi='" << param.fpi << "'" << endl;
487#endif
488 m_handler.doctype(param);
489 next();
490 return;
491 }
492
493 // Parse optional URI.
494 value(param.uri, false);
495
496 has_char_throw("DOCTYPE section too short.");
497 skip_space_and_control();
498 has_char_throw("DOCTYPE section too short.");
499
500 if (cur_char() != '>')
501 throw malformed_xml_error("malformed DOCTYPE section - closing '>' expected but not found.", offset());
502
503#if ORCUS_DEBUG_SAX_PARSER
504 cout << "sax_parser::doctype: root='" << param.root_element << "', fpi='" << param.fpi << "' uri='" << param.uri << "'" << endl;
505#endif
506 m_handler.doctype(param);
507 next();
508}
509
510template<typename HandlerT, typename ConfigT>
511void sax_parser<HandlerT,ConfigT>::characters()
512{
513 const char* p0 = mp_char;
514 for (; has_char(); next())
515 {
516 if (cur_char() == '<')
517 break;
518
519 if (cur_char() == '&')
520 {
521 // Text span with one or more encoded characters. Parse using cell buffer.
522 cell_buffer& buf = get_cell_buffer();
523 buf.reset();
524 buf.append(p0, mp_char-p0);
525 characters_with_encoded_char(buf);
526 if (buf.empty())
527 m_handler.characters(std::string_view{}, false);
528 else
529 m_handler.characters(buf.str(), true);
530 return;
531 }
532 }
533
534 if (mp_char > p0)
535 {
536 std::string_view val(p0, mp_char-p0);
537 m_handler.characters(val, false);
538 }
539}
540
541template<typename HandlerT, typename ConfigT>
542void sax_parser<HandlerT,ConfigT>::attribute()
543{
544 sax::parser_attribute attr;
545 attribute_name(attr.ns, attr.name);
546
547#if ORCUS_DEBUG_SAX_PARSER
548 cout << "sax_parser::attribute: ns='" << attr.ns << "', name='" << attr.name << "'" << endl;
549#endif
550
551 skip_space_and_control();
552
553 char c = cur_char_checked();
554 if (c != '=')
555 {
556 std::ostringstream os;
557 os << "Attribute must begin with 'name=..'. (ns='" << attr.ns << "', name='" << attr.name << "')";
558 throw malformed_xml_error(os.str(), offset());
559 }
560
561 next_check(); // skip the '='.
562 skip_space_and_control();
563
564 attr.transient = value(attr.value, true);
565 if (attr.transient)
566 // Value is stored in a temporary buffer. Push a new buffer.
567 inc_buffer_pos();
568
569#if ORCUS_DEBUG_SAX_PARSER
570 cout << "sax_parser::attribute: value='" << attr.value << "'" << endl;
571#endif
572
573 m_handler.attribute(attr);
574}
575
576}
577
578#endif
579/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
Definition parser_base.hpp:23
Definition sax_parser_base.hpp:108
Definition sax_parser.hpp:31
void end_declaration(std::string_view decl)
Definition sax_parser.hpp:60
void doctype(const orcus::sax::doctype_declaration &dtd)
Definition sax_parser.hpp:38
void attribute(const orcus::sax::parser_attribute &attr)
Definition sax_parser.hpp:112
void characters(std::string_view val, bool transient)
Definition sax_parser.hpp:99
void start_declaration(std::string_view decl)
Definition sax_parser.hpp:50
void end_element(const orcus::sax::parser_element &elem)
Definition sax_parser.hpp:80
void start_element(const orcus::sax::parser_element &elem)
Definition sax_parser.hpp:70
Definition sax_parser.hpp:135
Definition sax_parser_base.hpp:37
Definition sax_parser_base.hpp:96
Definition sax_parser_base.hpp:77
Definition sax_parser.hpp:21
static constexpr uint8_t baseline_version
Definition sax_parser.hpp:27