Hubbub
|
#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <stdio.h>
#include <parserutils/charset/utf8.h>
#include "utils/parserutilserror.h"
#include "utils/utils.h"
#include "hubbub/errors.h"
#include "tokeniser/entities.h"
#include "tokeniser/tokeniser.h"
Go to the source code of this file.
Data Structures | |
struct | hubbub_tokeniser_context |
Context for tokeniser. More... | |
struct | hubbub_tokeniser |
Tokeniser data structure. More... | |
Macros | |
#define | state(x) case x: |
#define | START_BUF(str, cptr, length) |
Various macros for manipulating buffers. More... | |
#define | COLLECT(str, cptr, length) |
#define | COLLECT_MS(str, cptr, length) |
#define | DOCTYPE "DOCTYPE" |
#define | DOCTYPE_LEN (SLEN(DOCTYPE) - 1) |
#define | PUBLIC "PUBLIC" |
#define | PUBLIC_LEN (SLEN(PUBLIC) - 1) |
#define | SYSTEM "SYSTEM" |
#define | SYSTEM_LEN (SLEN(SYSTEM) - 1) |
#define | CDATA "[CDATA[" |
#define | CDATA_LEN (SLEN(CDATA) - 1) |
Typedefs | |
typedef enum hubbub_tokeniser_state | hubbub_tokeniser_state |
Tokeniser states. More... | |
typedef struct hubbub_tokeniser_context | hubbub_tokeniser_context |
Context for tokeniser. More... | |
Variables | |
static const uint32_t | cp1252Table [32] |
Table of mappings between Windows-1252 codepoints 128-159 and UCS4. More... | |
static const uint8_t | u_fffd [3] = { '\xEF', '\xBF', '\xBD' } |
UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER. More... | |
static const hubbub_string | u_fffd_str = { u_fffd, sizeof(u_fffd) } |
static const uint8_t | lf = '\n' |
String for when we want to emit newlines. More... | |
static const hubbub_string | lf_str = { &lf, 1 } |
#define CDATA "[CDATA[" |
Definition at line 2740 of file tokeniser.c.
Referenced by hubbub_tokeniser_handle_match_cdata().
#define CDATA_LEN (SLEN(CDATA) - 1) |
Definition at line 2741 of file tokeniser.c.
Referenced by hubbub_tokeniser_handle_match_cdata().
#define COLLECT | ( | str, | |
cptr, | |||
length | |||
) |
Definition at line 637 of file tokeniser.c.
Referenced by hubbub_tokeniser_handle_attribute_name(), hubbub_tokeniser_handle_attribute_value_uq(), hubbub_tokeniser_handle_doctype_name(), and hubbub_tokeniser_handle_tag_name().
#define COLLECT_MS | ( | str, | |
cptr, | |||
length | |||
) |
Definition at line 648 of file tokeniser.c.
Referenced by hubbub_tokeniser_handle_attribute_value_dq(), hubbub_tokeniser_handle_attribute_value_sq(), hubbub_tokeniser_handle_character_reference_in_attribute_value(), hubbub_tokeniser_handle_doctype_public_dq(), hubbub_tokeniser_handle_doctype_public_sq(), hubbub_tokeniser_handle_doctype_system_dq(), and hubbub_tokeniser_handle_doctype_system_sq().
#define DOCTYPE "DOCTYPE" |
Definition at line 1985 of file tokeniser.c.
Referenced by hubbub_tokeniser_handle_match_doctype().
#define DOCTYPE_LEN (SLEN(DOCTYPE) - 1) |
Definition at line 1986 of file tokeniser.c.
Referenced by hubbub_tokeniser_handle_match_doctype().
#define PUBLIC "PUBLIC" |
Definition at line 2211 of file tokeniser.c.
Referenced by hubbub_tokeniser_handle_match_public().
#define PUBLIC_LEN (SLEN(PUBLIC) - 1) |
Definition at line 2212 of file tokeniser.c.
Referenced by hubbub_tokeniser_handle_match_public().
#define START_BUF | ( | str, | |
cptr, | |||
length | |||
) |
Various macros for manipulating buffers.
Todo: make some of these inline functions (type-safety).
Todo: document them properly here.
Definition at line 627 of file tokeniser.c.
Referenced by hubbub_tokeniser_handle_after_attribute_name(), hubbub_tokeniser_handle_before_attribute_name(), hubbub_tokeniser_handle_before_attribute_value(), hubbub_tokeniser_handle_before_doctype_name(), hubbub_tokeniser_handle_close_tag_open(), and hubbub_tokeniser_handle_tag_open().
#define state | ( | x | ) | case x: |
Referenced by hubbub_tokeniser_run().
#define SYSTEM "SYSTEM" |
Definition at line 2466 of file tokeniser.c.
Referenced by hubbub_tokeniser_handle_match_system().
#define SYSTEM_LEN (SLEN(SYSTEM) - 1) |
Definition at line 2467 of file tokeniser.c.
Referenced by hubbub_tokeniser_handle_match_system().
typedef struct hubbub_tokeniser_context hubbub_tokeniser_context |
Context for tokeniser.
typedef enum hubbub_tokeniser_state hubbub_tokeniser_state |
Tokeniser states.
enum hubbub_tokeniser_state |
Tokeniser states.
Definition at line 50 of file tokeniser.c.
|
inline static |
Emit a character token.
tokeniser | Tokeniser instance |
chars | Pointer to hubbub_string to emit |
Definition at line 3154 of file tokeniser.c.
References hubbub_token::character, hubbub_token::data, HUBBUB_TOKEN_CHARACTER, hubbub_tokeniser_emit_token(), and hubbub_token::type.
Referenced by hubbub_tokeniser_handle_cdata_block(), and hubbub_tokeniser_handle_data().
|
inline static |
Emit the current pending characters being stored in the tokeniser context.
tokeniser | Tokeniser instance |
Definition at line 3171 of file tokeniser.c.
References hubbub_token::character, hubbub_tokeniser::context, hubbub_token::data, hubbub_error_from_parserutils_error(), HUBBUB_TOKEN_CHARACTER, hubbub_tokeniser_emit_token(), hubbub_tokeniser::input, len, hubbub_string::len, hubbub_tokeniser_context::pending, hubbub_string::ptr, and hubbub_token::type.
Referenced by hubbub_tokeniser_handle_cdata_block(), and hubbub_tokeniser_handle_data().
|
inline static |
Emit the current comment token being stored in the tokeniser context.
tokeniser | Tokeniser instance |
Definition at line 3297 of file tokeniser.c.
References hubbub_tokeniser::buffer, hubbub_token::comment, hubbub_token::data, HUBBUB_TOKEN_COMMENT, hubbub_tokeniser_emit_token(), hubbub_string::len, hubbub_string::ptr, and hubbub_token::type.
Referenced by hubbub_tokeniser_handle_bogus_comment(), and hubbub_tokeniser_handle_comment().
|
inline static |
Emit the current doctype token being stored in the tokeniser context.
tokeniser | Tokeniser instance |
force_quirks | Force quirks mode on this document |
Definition at line 3315 of file tokeniser.c.
References hubbub_tokeniser::buffer, hubbub_tokeniser::context, hubbub_tokeniser_context::current_doctype, hubbub_token::data, hubbub_token::doctype, hubbub_doctype::force_quirks, HUBBUB_TOKEN_DOCTYPE, hubbub_tokeniser_emit_token(), hubbub_string::len, hubbub_doctype::name, hubbub_string::ptr, hubbub_doctype::public_id, hubbub_doctype::public_missing, hubbub_doctype::system_id, hubbub_doctype::system_missing, and hubbub_token::type.
Referenced by hubbub_tokeniser_handle_after_doctype_name(), hubbub_tokeniser_handle_after_doctype_public(), hubbub_tokeniser_handle_after_doctype_system(), hubbub_tokeniser_handle_before_doctype_name(), hubbub_tokeniser_handle_before_doctype_public(), hubbub_tokeniser_handle_before_doctype_system(), hubbub_tokeniser_handle_bogus_doctype(), hubbub_tokeniser_handle_doctype_name(), hubbub_tokeniser_handle_doctype_public_dq(), hubbub_tokeniser_handle_doctype_public_sq(), hubbub_tokeniser_handle_doctype_system_dq(), and hubbub_tokeniser_handle_doctype_system_sq().
|
inline static |
Emit the current tag token being stored in the tokeniser context.
tokeniser | Tokeniser instance |
Definition at line 3198 of file tokeniser.c.
References hubbub_tag::attributes, hubbub_tokeniser::buffer, hubbub_tokeniser::content_model, hubbub_tokeniser::context, hubbub_tokeniser_context::current_tag, hubbub_tokeniser_context::current_tag_type, hubbub_token::data, HUBBUB_CONTENT_MODEL_PCDATA, HUBBUB_NS_HTML, HUBBUB_TOKEN_START_TAG, hubbub_tokeniser_emit_token(), hubbub_tokeniser_context::last_start_tag_len, hubbub_tokeniser_context::last_start_tag_name, hubbub_string::len, hubbub_tag::n_attributes, name, hubbub_attribute::name, hubbub_tag::name, hubbub_tag::ns, hubbub_string::ptr, hubbub_tag::self_closing, hubbub_token::tag, hubbub_token::type, and hubbub_attribute::value.
Referenced by hubbub_tokeniser_handle_after_attribute_name(), hubbub_tokeniser_handle_after_attribute_value_q(), hubbub_tokeniser_handle_attribute_name(), hubbub_tokeniser_handle_attribute_value_dq(), hubbub_tokeniser_handle_attribute_value_sq(), hubbub_tokeniser_handle_attribute_value_uq(), hubbub_tokeniser_handle_before_attribute_name(), hubbub_tokeniser_handle_before_attribute_value(), hubbub_tokeniser_handle_self_closing_start_tag(), and hubbub_tokeniser_handle_tag_name().
|
static |
Definition at line 2875 of file tokeniser.c.
References hubbub_tokeniser_context::allowed_char, hubbub_tokeniser_context::base, hubbub_tokeniser_context::codepoint, hubbub_tokeniser_context::complete, hubbub_tokeniser_context::context, hubbub_tokeniser::context, hubbub_tokeniser_context::had_data, hubbub_error_from_parserutils_error(), HUBBUB_OK, hubbub_tokeniser::input, len, hubbub_tokeniser_context::length, hubbub_tokeniser_context::match_entity, hubbub_tokeniser_context::offset, hubbub_tokeniser_context::overflow, hubbub_tokeniser_context::poss_length, hubbub_tokeniser_context::prev_len, hubbub_tokeniser_context::return_state, hubbub_tokeniser::state, STATE_NAMED_ENTITY, and STATE_NUMBERED_ENTITY.
Referenced by hubbub_tokeniser_handle_character_reference_data(), and hubbub_tokeniser_handle_character_reference_in_attribute_value().
hubbub_error hubbub_tokeniser_create | ( | parserutils_inputstream * | input, |
hubbub_tokeniser ** | tokeniser | ||
) |
Create a hubbub tokeniser.
input | Input stream instance |
tokeniser | Pointer to location to receive tokeniser instance |
Definition at line 285 of file tokeniser.c.
References hubbub_tokeniser::buffer, hubbub_tokeniser::content_model, hubbub_tokeniser::context, hubbub_tokeniser::error_handler, hubbub_tokeniser::error_pw, hubbub_tokeniser::escape_flag, HUBBUB_BADPARM, HUBBUB_CONTENT_MODEL_PCDATA, hubbub_error_from_parserutils_error(), HUBBUB_NOMEM, HUBBUB_OK, hubbub_tokeniser::input, hubbub_tokeniser::insert_buf, hubbub_tokeniser::paused, hubbub_tokeniser::process_cdata_section, hubbub_tokeniser::state, STATE_DATA, hubbub_tokeniser::token_handler, and hubbub_tokeniser::token_pw.
Referenced by hubbub_parser_create().
hubbub_error hubbub_tokeniser_destroy | ( | hubbub_tokeniser * | tokeniser | ) |
Destroy a hubbub tokeniser.
tokeniser | The tokeniser instance to destroy |
Definition at line 340 of file tokeniser.c.
References hubbub_tag::attributes, hubbub_tokeniser::buffer, hubbub_tokeniser::context, hubbub_tokeniser_context::current_tag, HUBBUB_BADPARM, HUBBUB_OK, and hubbub_tokeniser::insert_buf.
Referenced by hubbub_parser_create(), and hubbub_parser_destroy().
|
static |
Emit a token, performing sanity checks if necessary.
tokeniser | Tokeniser instance |
token | Token to emit |
Definition at line 3349 of file tokeniser.c.
References hubbub_tag::attributes, hubbub_tokeniser::buffer, hubbub_token::character, hubbub_token::comment, hubbub_tokeniser::context, hubbub_token::data, hubbub_token::doctype, HUBBUB_OK, HUBBUB_PAUSED, HUBBUB_TOKEN_CHARACTER, HUBBUB_TOKEN_COMMENT, HUBBUB_TOKEN_DOCTYPE, HUBBUB_TOKEN_END_TAG, HUBBUB_TOKEN_EOF, HUBBUB_TOKEN_START_TAG, hubbub_tokeniser::input, hubbub_tokeniser::insert_buf, hubbub_string::len, hubbub_tag::n_attributes, hubbub_attribute::name, hubbub_doctype::name, hubbub_tag::name, hubbub_tokeniser::paused, hubbub_tokeniser_context::pending, hubbub_string::ptr, hubbub_doctype::public_id, hubbub_doctype::public_missing, hubbub_doctype::system_id, hubbub_doctype::system_missing, hubbub_token::tag, hubbub_tokeniser::token_handler, hubbub_tokeniser::token_pw, hubbub_token::type, and hubbub_attribute::value.
Referenced by emit_character_token(), emit_current_chars(), emit_current_comment(), emit_current_doctype(), emit_current_tag(), hubbub_tokeniser_handle_character_reference_data(), and hubbub_tokeniser_handle_data().
|
static |
Definition at line 1282 of file tokeniser.c.
References hubbub_tag::attributes, hubbub_tokeniser::context, hubbub_tokeniser_context::current_tag, emit_current_tag(), hubbub_error_from_parserutils_error(), HUBBUB_NOMEM, HUBBUB_NS_NULL, HUBBUB_OK, hubbub_tokeniser::input, len, hubbub_string::len, hubbub_tag::n_attributes, hubbub_attribute::name, hubbub_attribute::ns, hubbub_tokeniser_context::pending, hubbub_string::ptr, START_BUF, hubbub_tokeniser::state, STATE_ATTRIBUTE_NAME, STATE_BEFORE_ATTRIBUTE_VALUE, STATE_DATA, STATE_SELF_CLOSING_START_TAG, u_fffd, and hubbub_attribute::value.
Referenced by hubbub_tokeniser_run().
|
static |
Definition at line 1656 of file tokeniser.c.
References hubbub_tokeniser::context, emit_current_tag(), hubbub_error_from_parserutils_error(), HUBBUB_OK, hubbub_tokeniser::input, len, hubbub_tokeniser_context::pending, hubbub_tokeniser::state, STATE_BEFORE_ATTRIBUTE_NAME, STATE_DATA, and STATE_SELF_CLOSING_START_TAG.
Referenced by hubbub_tokeniser_run().
|
static |
Definition at line 2169 of file tokeniser.c.
References hubbub_tokeniser::context, hubbub_tokeniser_context::count, hubbub_tokeniser_context::current_doctype, emit_current_doctype(), hubbub_doctype::force_quirks, hubbub_error_from_parserutils_error(), HUBBUB_OK, hubbub_tokeniser::input, len, hubbub_tokeniser_context::match_doctype, hubbub_tokeniser_context::pending, hubbub_tokeniser::state, STATE_BOGUS_DOCTYPE, STATE_DATA, STATE_MATCH_PUBLIC, and STATE_MATCH_SYSTEM.
Referenced by hubbub_tokeniser_run().
|
static |
Definition at line 2417 of file tokeniser.c.
References hubbub_tokeniser::context, hubbub_tokeniser_context::current_doctype, emit_current_doctype(), hubbub_doctype::force_quirks, hubbub_error_from_parserutils_error(), HUBBUB_OK, hubbub_tokeniser::input, len, hubbub_string::len, hubbub_tokeniser_context::pending, hubbub_tokeniser::state, STATE_BOGUS_DOCTYPE, STATE_DATA, STATE_DOCTYPE_SYSTEM_DQ, STATE_DOCTYPE_SYSTEM_SQ, hubbub_doctype::system_id, and hubbub_doctype::system_missing.
Referenced by hubbub_tokeniser_run().
|
static |
Definition at line 2672 of file tokeniser.c.
References hubbub_tokeniser::context, emit_current_doctype(), hubbub_error_from_parserutils_error(), HUBBUB_OK, hubbub_tokeniser::input, len, hubbub_tokeniser_context::pending, hubbub_tokeniser::state, STATE_BOGUS_DOCTYPE, and STATE_DATA.
Referenced by hubbub_tokeniser_run().
|
static |
Definition at line 1226 of file tokeniser.c.
References hubbub_tag::attributes, COLLECT, hubbub_tokeniser::context, hubbub_tokeniser_context::current_tag, emit_current_tag(), hubbub_error_from_parserutils_error(), HUBBUB_OK, hubbub_tokeniser::input, len, hubbub_string::len, hubbub_tag::n_attributes, hubbub_attribute::name, hubbub_tokeniser_context::pending, hubbub_tokeniser::state, STATE_AFTER_ATTRIBUTE_NAME, STATE_BEFORE_ATTRIBUTE_VALUE, STATE_DATA, STATE_SELF_CLOSING_START_TAG, and u_fffd.
Referenced by hubbub_tokeniser_run().
|
static |
Definition at line 1420 of file tokeniser.c.
References hubbub_tokeniser_context::allowed_char, hubbub_tag::attributes, COLLECT_MS, hubbub_tokeniser::context, hubbub_tokeniser_context::current_tag, emit_current_tag(), hubbub_error_from_parserutils_error(), HUBBUB_OK, hubbub_tokeniser::input, len, lf, hubbub_tag::n_attributes, hubbub_tokeniser_context::pending, hubbub_tokeniser_context::prev_state, hubbub_tokeniser::state, STATE_AFTER_ATTRIBUTE_VALUE_Q, STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE, STATE_DATA, u_fffd, and hubbub_attribute::value.
Referenced by hubbub_tokeniser_run().
|
static |
Definition at line 1482 of file tokeniser.c.
References hubbub_tokeniser_context::allowed_char, hubbub_tag::attributes, COLLECT_MS, hubbub_tokeniser::context, hubbub_tokeniser_context::current_tag, emit_current_tag(), hubbub_error_from_parserutils_error(), HUBBUB_OK, hubbub_tokeniser::input, len, lf, hubbub_tag::n_attributes, hubbub_tokeniser_context::pending, hubbub_tokeniser_context::prev_state, hubbub_tokeniser::state, STATE_AFTER_ATTRIBUTE_VALUE_Q, STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE, STATE_DATA, u_fffd, and hubbub_attribute::value.
Referenced by hubbub_tokeniser_run().
|
static |
Definition at line 1544 of file tokeniser.c.
References hubbub_tag::attributes, COLLECT, hubbub_tokeniser::context, hubbub_tokeniser_context::current_tag, emit_current_tag(), hubbub_error_from_parserutils_error(), HUBBUB_OK, hubbub_tokeniser::input, len, hubbub_string::len, hubbub_tag::n_attributes, hubbub_tokeniser_context::pending, hubbub_tokeniser_context::prev_state, hubbub_tokeniser::state, STATE_BEFORE_ATTRIBUTE_NAME, STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE, STATE_DATA, u_fffd, and hubbub_attribute::value.
Referenced by hubbub_tokeniser_run().
|
static |
Definition at line 1154 of file tokeniser.c.
References hubbub_tag::attributes, hubbub_tokeniser::context, hubbub_tokeniser_context::current_tag, emit_current_tag(), hubbub_error_from_parserutils_error(), HUBBUB_NOMEM, HUBBUB_NS_NULL, HUBBUB_OK, hubbub_tokeniser::input, len, hubbub_string::len, hubbub_tag::n_attributes, hubbub_attribute::name, hubbub_attribute::ns, hubbub_tokeniser_context::pending, hubbub_string::ptr, START_BUF, hubbub_tokeniser::state, STATE_ATTRIBUTE_NAME, STATE_DATA, STATE_SELF_CLOSING_START_TAG, u_fffd, and hubbub_attribute::value.
Referenced by hubbub_tokeniser_run().
|
static |
Definition at line 1358 of file tokeniser.c.
References hubbub_tag::attributes, hubbub_tokeniser::context, hubbub_tokeniser_context::current_tag, emit_current_tag(), hubbub_error_from_parserutils_error(), HUBBUB_OK, hubbub_tokeniser::input, len, hubbub_tag::n_attributes, hubbub_tokeniser_context::pending, START_BUF, hubbub_tokeniser::state, STATE_ATTRIBUTE_VALUE_DQ, STATE_ATTRIBUTE_VALUE_SQ, STATE_ATTRIBUTE_VALUE_UQ, STATE_DATA, u_fffd, and hubbub_attribute::value.
Referenced by hubbub_tokeniser_run().
|
static |
Definition at line 2074 of file tokeniser.c.
References hubbub_tokeniser::context, hubbub_tokeniser_context::current_doctype, emit_current_doctype(), hubbub_error_from_parserutils_error(), HUBBUB_OK, hubbub_tokeniser::input, len, hubbub_doctype::name, hubbub_tokeniser_context::pending, START_BUF, hubbub_tokeniser::state, STATE_DATA, STATE_DOCTYPE_NAME, and u_fffd.
Referenced by hubbub_tokeniser_run().
|
static |
Definition at line 2258 of file tokeniser.c.
References hubbub_tokeniser::context, hubbub_tokeniser_context::current_doctype, emit_current_doctype(), hubbub_doctype::force_quirks, hubbub_error_from_parserutils_error(), HUBBUB_OK, hubbub_tokeniser::input, len, hubbub_string::len, hubbub_tokeniser_context::pending, hubbub_doctype::public_id, hubbub_doctype::public_missing, hubbub_tokeniser::state, STATE_BOGUS_DOCTYPE, STATE_DATA, STATE_DOCTYPE_PUBLIC_DQ, and STATE_DOCTYPE_PUBLIC_SQ.
Referenced by hubbub_tokeniser_run().
|
static |
Definition at line 2513 of file tokeniser.c.
References hubbub_tokeniser::context, hubbub_tokeniser_context::current_doctype, emit_current_doctype(), hubbub_doctype::force_quirks, hubbub_error_from_parserutils_error(), HUBBUB_OK, hubbub_tokeniser::input, len, hubbub_string::len, hubbub_tokeniser_context::pending, hubbub_tokeniser::state, STATE_BOGUS_DOCTYPE, STATE_DATA, STATE_DOCTYPE_SYSTEM_DQ, STATE_DOCTYPE_SYSTEM_SQ, hubbub_doctype::system_id, and hubbub_doctype::system_missing.
Referenced by hubbub_tokeniser_run().
|
static |
Definition at line 1735 of file tokeniser.c.
References hubbub_tokeniser::buffer, hubbub_tokeniser::context, emit_current_comment(), hubbub_error_from_parserutils_error(), HUBBUB_OK, hubbub_tokeniser::input, len, lf, hubbub_tokeniser_context::pending, hubbub_tokeniser::state, STATE_DATA, and u_fffd.
Referenced by hubbub_tokeniser_run().
|
static |
Definition at line 2708 of file tokeniser.c.
References hubbub_tokeniser::context, emit_current_doctype(), hubbub_error_from_parserutils_error(), HUBBUB_OK, hubbub_tokeniser::input, len, hubbub_tokeniser_context::pending, hubbub_tokeniser::state, and STATE_DATA.
Referenced by hubbub_tokeniser_run().
|
static |
Definition at line 2795 of file tokeniser.c.
References hubbub_tokeniser::context, emit_character_token(), emit_current_chars(), hubbub_tokeniser_context::end, hubbub_error_from_parserutils_error(), HUBBUB_OK, hubbub_tokeniser::input, len, lf_str, hubbub_tokeniser_context::match_cdata, hubbub_tokeniser_context::pending, SLEN, hubbub_tokeniser::state, STATE_DATA, and u_fffd_str.
Referenced by hubbub_tokeniser_run().
|
static |
Definition at line 802 of file tokeniser.c.
References hubbub_token::character, hubbub_tokeniser_context::codepoint, hubbub_tokeniser_context::complete, hubbub_tokeniser::context, hubbub_token::data, hubbub_error_from_parserutils_error(), HUBBUB_OK, HUBBUB_TOKEN_CHARACTER, hubbub_tokeniser_consume_character_reference(), hubbub_tokeniser_emit_token(), hubbub_tokeniser::input, len, hubbub_string::len, hubbub_tokeniser_context::length, hubbub_tokeniser_context::match_entity, hubbub_tokeniser_context::pending, hubbub_string::ptr, hubbub_tokeniser::state, STATE_DATA, and hubbub_token::type.
Referenced by hubbub_tokeniser_run().
|
static |
Definition at line 1599 of file tokeniser.c.
References hubbub_tag::attributes, hubbub_tokeniser_context::codepoint, COLLECT_MS, hubbub_tokeniser_context::complete, hubbub_tokeniser::context, hubbub_tokeniser_context::current_tag, hubbub_error_from_parserutils_error(), HUBBUB_OK, hubbub_tokeniser_consume_character_reference(), hubbub_tokeniser::input, len, hubbub_tokeniser_context::length, hubbub_tokeniser_context::match_entity, hubbub_tag::n_attributes, hubbub_tokeniser_context::pending, hubbub_tokeniser_context::prev_state, hubbub_tokeniser::state, and hubbub_attribute::value.
Referenced by hubbub_tokeniser_run().
|
static |
Definition at line 956 of file tokeniser.c.
References hubbub_tokeniser_context::close_tag_match, hubbub_tokeniser::content_model, hubbub_tokeniser::context, hubbub_tokeniser_context::count, hubbub_tokeniser_context::current_tag, hubbub_tokeniser_context::current_tag_type, HUBBUB_CONTENT_MODEL_CDATA, HUBBUB_CONTENT_MODEL_PCDATA, HUBBUB_CONTENT_MODEL_RCDATA, hubbub_error_from_parserutils_error(), HUBBUB_OK, HUBBUB_TOKEN_END_TAG, hubbub_tokeniser::input, hubbub_tokeniser_context::last_start_tag_len, hubbub_tokeniser_context::last_start_tag_name, len, hubbub_tokeniser_context::match, hubbub_tag::n_attributes, hubbub_tag::name, hubbub_tokeniser_context::pending, START_BUF, hubbub_tokeniser::state, STATE_BOGUS_COMMENT, STATE_DATA, and STATE_TAG_NAME.
Referenced by hubbub_tokeniser_run().
|
static |
Definition at line 1873 of file tokeniser.c.
References hubbub_tokeniser::buffer, hubbub_tokeniser::context, emit_current_comment(), hubbub_error_from_parserutils_error(), HUBBUB_OK, hubbub_tokeniser::input, len, lf, hubbub_tokeniser_context::pending, SLEN, hubbub_tokeniser::state, STATE_COMMENT, STATE_COMMENT_END, STATE_COMMENT_END_DASH, STATE_COMMENT_START, STATE_COMMENT_START_DASH, STATE_DATA, and u_fffd.
Referenced by hubbub_tokeniser_run().
|
static |
Definition at line 660 of file tokeniser.c.
References hubbub_tokeniser::content_model, hubbub_tokeniser::context, emit_character_token(), emit_current_chars(), hubbub_tokeniser::escape_flag, HUBBUB_CONTENT_MODEL_CDATA, HUBBUB_CONTENT_MODEL_PCDATA, HUBBUB_CONTENT_MODEL_RCDATA, hubbub_error_from_parserutils_error(), HUBBUB_NEEDDATA, HUBBUB_TOKEN_EOF, hubbub_tokeniser_emit_token(), hubbub_tokeniser::input, len, lf_str, hubbub_tokeniser_context::pending, SLEN, hubbub_tokeniser::state, STATE_CHARACTER_REFERENCE_DATA, STATE_DATA, STATE_TAG_OPEN, hubbub_token::type, and u_fffd_str.
Referenced by hubbub_tokeniser_run().
|
static |
Definition at line 2044 of file tokeniser.c.
References hubbub_tokeniser::context, hubbub_error_from_parserutils_error(), HUBBUB_OK, hubbub_tokeniser::input, len, hubbub_tokeniser_context::pending, hubbub_tokeniser::state, and STATE_BEFORE_DOCTYPE_NAME.
Referenced by hubbub_tokeniser_run().
|
static |
Definition at line 2125 of file tokeniser.c.
References COLLECT, hubbub_tokeniser::context, hubbub_tokeniser_context::current_doctype, emit_current_doctype(), hubbub_error_from_parserutils_error(), HUBBUB_OK, hubbub_tokeniser::input, len, hubbub_doctype::name, hubbub_tokeniser_context::pending, hubbub_tokeniser::state, STATE_AFTER_DOCTYPE_NAME, STATE_DATA, and u_fffd.
Referenced by hubbub_tokeniser_run().
|
static |
Definition at line 2303 of file tokeniser.c.
References COLLECT_MS, hubbub_tokeniser::context, hubbub_tokeniser_context::current_doctype, emit_current_doctype(), hubbub_error_from_parserutils_error(), HUBBUB_OK, hubbub_tokeniser::input, len, lf, hubbub_tokeniser_context::pending, hubbub_doctype::public_id, hubbub_tokeniser::state, STATE_AFTER_DOCTYPE_PUBLIC, STATE_DATA, and u_fffd.
Referenced by hubbub_tokeniser_run().
|
static |
Definition at line 2360 of file tokeniser.c.
References COLLECT_MS, hubbub_tokeniser::context, hubbub_tokeniser_context::current_doctype, emit_current_doctype(), hubbub_error_from_parserutils_error(), HUBBUB_OK, hubbub_tokeniser::input, len, lf, hubbub_tokeniser_context::pending, hubbub_doctype::public_id, hubbub_tokeniser::state, STATE_AFTER_DOCTYPE_PUBLIC, STATE_DATA, and u_fffd.
Referenced by hubbub_tokeniser_run().
|
static |
Definition at line 2560 of file tokeniser.c.
References COLLECT_MS, hubbub_tokeniser::context, hubbub_tokeniser_context::current_doctype, emit_current_doctype(), hubbub_error_from_parserutils_error(), HUBBUB_OK, hubbub_tokeniser::input, len, lf, hubbub_tokeniser_context::pending, hubbub_tokeniser::state, STATE_AFTER_DOCTYPE_SYSTEM, STATE_DATA, hubbub_doctype::system_id, and u_fffd.
Referenced by hubbub_tokeniser_run().
|
static |
Definition at line 2616 of file tokeniser.c.
References COLLECT_MS, hubbub_tokeniser::context, hubbub_tokeniser_context::current_doctype, emit_current_doctype(), hubbub_error_from_parserutils_error(), HUBBUB_OK, hubbub_tokeniser::input, len, lf, hubbub_tokeniser_context::pending, hubbub_tokeniser::state, STATE_AFTER_DOCTYPE_SYSTEM, STATE_DATA, hubbub_doctype::system_id, and u_fffd.
Referenced by hubbub_tokeniser_run().
|
static |
Definition at line 1798 of file tokeniser.c.
References hubbub_tokeniser::context, hubbub_tokeniser_context::count, hubbub_error_from_parserutils_error(), HUBBUB_OK, hubbub_tokeniser::input, len, hubbub_tokeniser_context::match_cdata, hubbub_tokeniser_context::match_doctype, hubbub_tokeniser_context::pending, hubbub_tokeniser::process_cdata_section, hubbub_tokeniser::state, STATE_BOGUS_COMMENT, STATE_MATCH_CDATA, STATE_MATCH_COMMENT, and STATE_MATCH_DOCTYPE.
Referenced by hubbub_tokeniser_run().
|
static |
Definition at line 2743 of file tokeniser.c.
References CDATA, CDATA_LEN, hubbub_tokeniser::context, hubbub_tokeniser_context::count, hubbub_tokeniser_context::current_comment, hubbub_tokeniser_context::end, hubbub_error_from_parserutils_error(), HUBBUB_OK, hubbub_tokeniser::input, len, hubbub_string::len, hubbub_tokeniser_context::match_cdata, hubbub_tokeniser_context::pending, hubbub_tokeniser::state, STATE_BOGUS_COMMENT, and STATE_CDATA_BLOCK.
Referenced by hubbub_tokeniser_run().
|
static |
Definition at line 1840 of file tokeniser.c.
References hubbub_tokeniser::context, hubbub_tokeniser_context::current_comment, hubbub_error_from_parserutils_error(), HUBBUB_OK, hubbub_tokeniser::input, len, hubbub_string::len, hubbub_tokeniser_context::pending, SLEN, hubbub_tokeniser::state, STATE_BOGUS_COMMENT, and STATE_COMMENT_START.
Referenced by hubbub_tokeniser_run().
|
static |
Definition at line 1988 of file tokeniser.c.
References hubbub_tokeniser::context, hubbub_tokeniser_context::count, hubbub_tokeniser_context::current_comment, hubbub_tokeniser_context::current_doctype, DOCTYPE, DOCTYPE_LEN, hubbub_error_from_parserutils_error(), HUBBUB_OK, hubbub_tokeniser::input, len, hubbub_string::len, hubbub_tokeniser_context::match_doctype, hubbub_tokeniser_context::pending, hubbub_doctype::public_missing, hubbub_tokeniser::state, STATE_BOGUS_COMMENT, STATE_DOCTYPE, and hubbub_doctype::system_missing.
Referenced by hubbub_tokeniser_run().
|
static |
Definition at line 2214 of file tokeniser.c.
References hubbub_tokeniser::context, hubbub_tokeniser_context::count, hubbub_tokeniser_context::current_doctype, hubbub_doctype::force_quirks, hubbub_error_from_parserutils_error(), HUBBUB_OK, hubbub_tokeniser::input, len, hubbub_tokeniser_context::match_doctype, hubbub_tokeniser_context::pending, PUBLIC, PUBLIC_LEN, hubbub_tokeniser::state, STATE_BEFORE_DOCTYPE_PUBLIC, and STATE_BOGUS_DOCTYPE.
Referenced by hubbub_tokeniser_run().
|
static |
Definition at line 2469 of file tokeniser.c.
References hubbub_tokeniser::context, hubbub_tokeniser_context::count, hubbub_tokeniser_context::current_doctype, hubbub_doctype::force_quirks, hubbub_error_from_parserutils_error(), HUBBUB_OK, hubbub_tokeniser::input, len, hubbub_tokeniser_context::match_doctype, hubbub_tokeniser_context::pending, hubbub_tokeniser::state, STATE_BEFORE_DOCTYPE_SYSTEM, STATE_BOGUS_DOCTYPE, SYSTEM, and SYSTEM_LEN.
Referenced by hubbub_tokeniser_run().
|
static |
Definition at line 3047 of file tokeniser.c.
References hubbub_tokeniser_context::codepoint, hubbub_tokeniser_context::complete, hubbub_tokeniser_context::context, hubbub_tokeniser::context, hubbub_entities_search_step(), hubbub_error_from_parserutils_error(), HUBBUB_INVALID, HUBBUB_OK, hubbub_tokeniser::input, len, hubbub_tokeniser_context::length, hubbub_tokeniser_context::match_entity, hubbub_tokeniser_context::offset, hubbub_tokeniser_context::poss_length, hubbub_tokeniser_context::return_state, hubbub_tokeniser::state, and STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE.
Referenced by hubbub_tokeniser_run().
|
static |
Definition at line 2943 of file tokeniser.c.
References hubbub_tokeniser_context::base, hubbub_tokeniser_context::codepoint, hubbub_tokeniser_context::complete, hubbub_tokeniser::context, cp1252Table, hubbub_tokeniser_context::had_data, hubbub_error_from_parserutils_error(), HUBBUB_OK, hubbub_tokeniser::input, len, hubbub_tokeniser_context::length, hubbub_tokeniser_context::match_entity, hubbub_tokeniser_context::offset, hubbub_tokeniser_context::overflow, hubbub_tokeniser_context::return_state, and hubbub_tokeniser::state.
Referenced by hubbub_tokeniser_run().
|
static |
Definition at line 1698 of file tokeniser.c.
References hubbub_tokeniser::context, hubbub_tokeniser_context::current_tag, emit_current_tag(), hubbub_error_from_parserutils_error(), HUBBUB_OK, hubbub_tokeniser::input, len, hubbub_tokeniser_context::pending, hubbub_tag::self_closing, hubbub_tokeniser::state, STATE_BEFORE_ATTRIBUTE_NAME, and STATE_DATA.
Referenced by hubbub_tokeniser_run().
|
static |
Definition at line 1101 of file tokeniser.c.
References COLLECT, hubbub_tokeniser::context, hubbub_tokeniser_context::current_tag, emit_current_tag(), hubbub_error_from_parserutils_error(), HUBBUB_OK, hubbub_tokeniser::input, len, hubbub_string::len, hubbub_tag::name, hubbub_tokeniser_context::pending, hubbub_tokeniser::state, STATE_BEFORE_ATTRIBUTE_NAME, STATE_DATA, STATE_SELF_CLOSING_START_TAG, and u_fffd.
Referenced by hubbub_tokeniser_run().
|
static |
Definition at line 865 of file tokeniser.c.
References hubbub_tokeniser_context::close_tag_match, hubbub_tokeniser::content_model, hubbub_tokeniser::context, hubbub_tokeniser_context::count, hubbub_tokeniser_context::current_tag, hubbub_tokeniser_context::current_tag_type, HUBBUB_CONTENT_MODEL_CDATA, HUBBUB_CONTENT_MODEL_PCDATA, HUBBUB_CONTENT_MODEL_RCDATA, hubbub_error_from_parserutils_error(), HUBBUB_OK, HUBBUB_TOKEN_START_TAG, hubbub_tokeniser::input, len, hubbub_tokeniser_context::match, hubbub_tag::n_attributes, hubbub_tag::name, hubbub_tokeniser_context::pending, SLEN, START_BUF, hubbub_tokeniser::state, STATE_BOGUS_COMMENT, STATE_CLOSE_TAG_OPEN, STATE_DATA, STATE_MARKUP_DECLARATION_OPEN, and STATE_TAG_NAME.
Referenced by hubbub_tokeniser_run().
hubbub_error hubbub_tokeniser_insert_chunk | ( | hubbub_tokeniser * | tokeniser, |
const uint8_t * | data, | ||
size_t | len | ||
) |
Insert a chunk of data into the input stream.
Inserts the given data into the input stream ready for parsing but does not cause any additional processing of the input.
tokeniser | Tokeniser instance |
data | Data to insert (UTF-8 encoded) |
len | Length, in bytes, of data |
Definition at line 415 of file tokeniser.c.
References HUBBUB_BADPARM, hubbub_error_from_parserutils_error(), HUBBUB_OK, hubbub_tokeniser::insert_buf, and len.
Referenced by hubbub_parser_insert_chunk().
hubbub_error hubbub_tokeniser_run | ( | hubbub_tokeniser * | tokeniser | ) |
Process remaining data in the input stream.
tokeniser | The tokeniser instance to invoke |
Definition at line 436 of file tokeniser.c.
References HUBBUB_BADPARM, HUBBUB_NEEDDATA, HUBBUB_OK, HUBBUB_PAUSED, hubbub_tokeniser_handle_after_attribute_name(), hubbub_tokeniser_handle_after_attribute_value_q(), hubbub_tokeniser_handle_after_doctype_name(), hubbub_tokeniser_handle_after_doctype_public(), hubbub_tokeniser_handle_after_doctype_system(), hubbub_tokeniser_handle_attribute_name(), hubbub_tokeniser_handle_attribute_value_dq(), hubbub_tokeniser_handle_attribute_value_sq(), hubbub_tokeniser_handle_attribute_value_uq(), hubbub_tokeniser_handle_before_attribute_name(), hubbub_tokeniser_handle_before_attribute_value(), hubbub_tokeniser_handle_before_doctype_name(), hubbub_tokeniser_handle_before_doctype_public(), hubbub_tokeniser_handle_before_doctype_system(), hubbub_tokeniser_handle_bogus_comment(), hubbub_tokeniser_handle_bogus_doctype(), hubbub_tokeniser_handle_cdata_block(), hubbub_tokeniser_handle_character_reference_data(), hubbub_tokeniser_handle_character_reference_in_attribute_value(), hubbub_tokeniser_handle_close_tag_open(), hubbub_tokeniser_handle_comment(), hubbub_tokeniser_handle_data(), hubbub_tokeniser_handle_doctype(), hubbub_tokeniser_handle_doctype_name(), hubbub_tokeniser_handle_doctype_public_dq(), hubbub_tokeniser_handle_doctype_public_sq(), hubbub_tokeniser_handle_doctype_system_dq(), hubbub_tokeniser_handle_doctype_system_sq(), hubbub_tokeniser_handle_markup_declaration_open(), hubbub_tokeniser_handle_match_cdata(), hubbub_tokeniser_handle_match_comment(), hubbub_tokeniser_handle_match_doctype(), hubbub_tokeniser_handle_match_public(), hubbub_tokeniser_handle_match_system(), hubbub_tokeniser_handle_named_entity(), hubbub_tokeniser_handle_numbered_entity(), hubbub_tokeniser_handle_self_closing_start_tag(), hubbub_tokeniser_handle_tag_name(), hubbub_tokeniser_handle_tag_open(), hubbub_tokeniser::paused, hubbub_tokeniser::state, state, STATE_AFTER_ATTRIBUTE_NAME, STATE_AFTER_ATTRIBUTE_VALUE_Q, STATE_AFTER_DOCTYPE_NAME, STATE_AFTER_DOCTYPE_PUBLIC, STATE_AFTER_DOCTYPE_SYSTEM, 
STATE_ATTRIBUTE_NAME, STATE_ATTRIBUTE_VALUE_DQ, STATE_ATTRIBUTE_VALUE_SQ, STATE_ATTRIBUTE_VALUE_UQ, STATE_BEFORE_ATTRIBUTE_NAME, STATE_BEFORE_ATTRIBUTE_VALUE, STATE_BEFORE_DOCTYPE_NAME, STATE_BEFORE_DOCTYPE_PUBLIC, STATE_BEFORE_DOCTYPE_SYSTEM, STATE_BOGUS_COMMENT, STATE_BOGUS_DOCTYPE, STATE_CDATA_BLOCK, STATE_CHARACTER_REFERENCE_DATA, STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE, STATE_CLOSE_TAG_OPEN, STATE_COMMENT, STATE_COMMENT_END, STATE_COMMENT_END_DASH, STATE_COMMENT_START, STATE_COMMENT_START_DASH, STATE_DATA, STATE_DOCTYPE, STATE_DOCTYPE_NAME, STATE_DOCTYPE_PUBLIC_DQ, STATE_DOCTYPE_PUBLIC_SQ, STATE_DOCTYPE_SYSTEM_DQ, STATE_DOCTYPE_SYSTEM_SQ, STATE_MARKUP_DECLARATION_OPEN, STATE_MATCH_CDATA, STATE_MATCH_COMMENT, STATE_MATCH_DOCTYPE, STATE_MATCH_PUBLIC, STATE_MATCH_SYSTEM, STATE_NAMED_ENTITY, STATE_NUMBERED_ENTITY, STATE_SELF_CLOSING_START_TAG, STATE_TAG_NAME, and STATE_TAG_OPEN.
Referenced by hubbub_parser_completed(), hubbub_parser_parse_chunk(), and hubbub_tokeniser_setopt().
hubbub_error hubbub_tokeniser_setopt | ( | hubbub_tokeniser * | tokeniser, |
hubbub_tokeniser_opttype | type, | ||
hubbub_tokeniser_optparams * | params | ||
) |
Configure a hubbub tokeniser.
Parameters:
- tokeniser: The tokeniser instance to configure
- type: The option type to set
- params: Option-specific parameters
Definition at line 366 of file tokeniser.c.
References hubbub_tokeniser_optparams::content_model, hubbub_tokeniser::content_model, hubbub_tokeniser_optparams::error_handler, hubbub_tokeniser::error_handler, hubbub_tokeniser::error_pw, hubbub_tokeniser_optparams::handler, HUBBUB_BADPARM, HUBBUB_OK, HUBBUB_TOKENISER_CONTENT_MODEL, HUBBUB_TOKENISER_ERROR_HANDLER, HUBBUB_TOKENISER_PAUSE, HUBBUB_TOKENISER_PROCESS_CDATA, hubbub_tokeniser_run(), HUBBUB_TOKENISER_TOKEN_HANDLER, hubbub_tokeniser_optparams::model, hubbub_tokeniser_optparams::pause_parse, hubbub_tokeniser::paused, hubbub_tokeniser_optparams::process_cdata, hubbub_tokeniser::process_cdata_section, hubbub_tokeniser_optparams::pw, hubbub_tokeniser_optparams::token_handler, hubbub_tokeniser::token_handler, hubbub_tokeniser::token_pw, and type.
Referenced by hubbub_parser_setopt(), hubbub_treebuilder_create(), hubbub_treebuilder_destroy(), parse_generic_rcdata(), and process_plaintext_in_body().
static const uint32_t cp1252Table[32]
Table of mappings between Windows-1252 codepoints 128-159 and UCS4.
Definition at line 26 of file tokeniser.c.
Referenced by hubbub_tokeniser_handle_numbered_entity().
static const uint8_t lf = '\n'
String for when we want to emit newlines.
Definition at line 43 of file tokeniser.c.
Referenced by hubbub_tokeniser_handle_attribute_value_dq(), hubbub_tokeniser_handle_attribute_value_sq(), hubbub_tokeniser_handle_bogus_comment(), hubbub_tokeniser_handle_comment(), hubbub_tokeniser_handle_doctype_public_dq(), hubbub_tokeniser_handle_doctype_public_sq(), hubbub_tokeniser_handle_doctype_system_dq(), and hubbub_tokeniser_handle_doctype_system_sq().
static const hubbub_string lf_str = { &lf, 1 }
Definition at line 44 of file tokeniser.c.
Referenced by hubbub_tokeniser_handle_cdata_block(), and hubbub_tokeniser_handle_data().
|
static |
UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER.
Definition at line 36 of file tokeniser.c.
Referenced by hubbub_tokeniser_handle_after_attribute_name(), hubbub_tokeniser_handle_attribute_name(), hubbub_tokeniser_handle_attribute_value_dq(), hubbub_tokeniser_handle_attribute_value_sq(), hubbub_tokeniser_handle_attribute_value_uq(), hubbub_tokeniser_handle_before_attribute_name(), hubbub_tokeniser_handle_before_attribute_value(), hubbub_tokeniser_handle_before_doctype_name(), hubbub_tokeniser_handle_bogus_comment(), hubbub_tokeniser_handle_comment(), hubbub_tokeniser_handle_doctype_name(), hubbub_tokeniser_handle_doctype_public_dq(), hubbub_tokeniser_handle_doctype_public_sq(), hubbub_tokeniser_handle_doctype_system_dq(), hubbub_tokeniser_handle_doctype_system_sq(), and hubbub_tokeniser_handle_tag_name().
|
static |
Definition at line 37 of file tokeniser.c.
Referenced by hubbub_tokeniser_handle_cdata_block(), and hubbub_tokeniser_handle_data().