% \iffalse meta-comment
%
%% File: l3tl-analysis.dtx
%
% Copyright (C) 2011-2024 The LaTeX Project
%
% It may be distributed and/or modified under the conditions of the
% LaTeX Project Public License (LPPL), either version 1.3c of this
% license or (at your option) any later version. The latest version
% of this license is in the file
%
%    https://www.latex-project.org/lppl.txt
%
% This file is part of the "l3kernel bundle" (The Work in LPPL)
% and all files in that bundle must be distributed together.
%
% -----------------------------------------------------------------------
%
% The development version of the bundle can be found at
%
%    https://github.com/latex3/latex3
%
% for those people who are interested.
%
% Driver: typesets this file (\jobname.dtx) with the l3doc class.
% The docstrip guard pair around it must stay balanced.
%
%<*driver>
\documentclass[full,kernel]{l3doc}
\begin{document}
  \DocInput{\jobname.dtx}
\end{document}
%</driver>
% \fi
%
% \title{^^A
%   The \pkg{l3tl-analysis} module\\ Analysing token lists^^A
% }
%
% \author{^^A
%  The \LaTeX{} Project\thanks
%    {^^A
%      E-mail:
%        \href{mailto:latex-team@latex-project.org}
%          {latex-team@latex-project.org}^^A
%    }^^A
% }
%
% \date{Released 2024-12-09}
%
% \maketitle
%
% \begin{documentation}
%
% This module provides functions that are particularly useful in the
% \pkg{l3regex} module for mapping through a token list one \meta{token}
% at a time (including begin-group/end-group tokens). For
% \cs{tl_analysis_map_inline:Nn} or \cs{tl_analysis_map_inline:nn}, the
% token list is given as an argument; the analogous function
% \cs{peek_analysis_map_inline:n} documented in \pkg{l3token} finds
% tokens in the input stream instead. In both cases the user provides
% \meta{inline code} that receives three arguments for each
% \meta{token}:
% \begin{itemize}
%   \item \meta{tokens}, which both \texttt{o}-expand and
%     \texttt{e}/\texttt{x}-expand to the \meta{token}. The detailed form of
%     \meta{tokens} may change in later releases.
%   \item \meta{char code}, a decimal representation of the character
%     code of the \meta{token}, $-1$ if it is a control sequence.
%   \item \meta{catcode}, a capital hexadecimal digit which denotes the
%     category code of the \meta{token} (0:~control sequence,
%     1:~begin-group, 2:~end-group, 3:~math shift, 4:~alignment tab,
%     6:~parameter, 7:~superscript, 8:~subscript, A:~space, B:~letter,
%     C:~other, D:~active). This can be converted to an integer by
%     writing |"|\meta{catcode}.
% \end{itemize}
% In addition, there is a debugging function \cs{tl_analysis_show:n},
% very similar to the \cs[no-index]{ShowTokens} macro from the \pkg{ted} package.
%
% \begin{function}[added = 2021-05-11]
%   {
%     \tl_analysis_show:N, \tl_analysis_show:n,
%     \tl_analysis_log:N, \tl_analysis_log:n
%   }
%   \begin{syntax}
%     \cs{tl_analysis_show:n} \Arg{token list}
%     \cs{tl_analysis_log:n} \Arg{token list}
%   \end{syntax}
%   Displays to the terminal (or log) the detailed decomposition of the
%   \meta{token list} into tokens, showing the category code of each
%   character token, the meaning of control sequences and active
%   characters, and the value of registers.
% \end{function}
%
% \begin{function}[added = 2018-04-09, updated = 2022-03-26]
%   {\tl_analysis_map_inline:nn, \tl_analysis_map_inline:Nn}
%   \begin{syntax}
%     \cs{tl_analysis_map_inline:nn} \Arg{token list} \Arg{inline function}
%   \end{syntax}
%   Applies the \meta{inline function} to each individual \meta{token}
%   in the \meta{token list}. The \meta{inline function} receives three
%   arguments as explained above. As with all other mappings, the mapping
%   is done at the current group level, \emph{i.e.}~any local assignments
%   made by the \meta{inline function} remain in effect after the loop.
% \end{function}
%
% \end{documentation}
%
% \begin{implementation}
%
% \section{\pkg{l3tl-analysis} implementation}
%
%    \begin{macrocode}
%<@@=tl>
%    \end{macrocode}
%
% \subsection{Internal functions}
%
% \begin{variable}{\s_@@}
%   The format used to store token lists internally uses the scan mark
%   \cs{s_@@} as a delimiter.
% \end{variable}
%
% \subsection{Internal format}
%
% The task of the \pkg{l3tl-analysis} module is to convert token lists
% to an internal format which allows us to extract all the relevant
% information about individual tokens (category code, character code),
% as well as reconstruct the token list quickly. This internal format is
% used in \pkg{l3regex} where we need to support arbitrary tokens, and
% it is used in conversion functions in \pkg{l3str-convert}, where we wish to
% support clusters of characters instead of single tokens.
%
% We thus need a way to encode any \meta{token} (even begin-group and
% end-group character tokens) in a way amenable to manipulating tokens
% individually. The best we can do is to find \meta{tokens} which both
% \texttt{o}-expand and \texttt{e}/\texttt{x}-expand to the given
% \meta{token}. Collecting more information about the category code and
% character code is also useful for regular expressions, since most
% regexes are catcode-agnostic. The internal format thus takes the form
% of a succession of items of the form
% \begin{quote}
%   \meta{tokens} \cs{s_@@} \meta{catcode} \meta{char code} \cs{s_@@}
% \end{quote}
% The \meta{tokens} \texttt{o}- \emph{and} \texttt{e}/\texttt{x}-expand to the
% original token in the token list or to the cluster of tokens
% corresponding to one Unicode character in the given encoding (for
% \pkg{l3str-convert}). The \meta{catcode} is given as a single hexadecimal
% digit, $0$ for control sequences. The \meta{char code} is given as a
% decimal number, $-1$ for control sequences.
%
% Using delimited arguments lets us build the \meta{tokens}
% progressively when doing an encoding conversion in \pkg{l3str-convert}. On the
% other hand, the delimiter \cs{s_@@} may not appear unbraced in
% \meta{tokens}. This is not a problem because we are careful to wrap
% control sequences in braces (as an argument to \cs{exp_not:n}) when
% converting from a general token list to the internal format.
%
% The current rule for converting a \meta{token} to a balanced set of
% \meta{tokens} which both \texttt{o}-expands and \texttt{e}/\texttt{x}-expands to
% it is the following.
% \begin{itemize}
%   \item A control sequence |\cs| becomes |\exp_not:n { \cs }|
%     \cs{s_@@} $0$ $-1$ \cs{s_@@}.
%   \item A begin-group character |{| becomes \cs{exp_after:wN} |{|
%     \cs{if_false:} |}| \cs{fi:} \cs{s_@@} $1$ \meta{char code}
%     \cs{s_@@}.
%   \item An end-group character |}| becomes \cs{if_false:} |{| \cs{fi:}
%     |}| \cs{s_@@} $2$ \meta{char code} \cs{s_@@}.
%   \item A character with any other category code becomes
%     \cs{exp_not:n} \Arg{character} \cs{s_@@} \meta{hex catcode}
%     \meta{char code} \cs{s_@@}.
% \end{itemize}
% In contrast, for \cs{peek_analysis_map_inline:n} we must allow for an
% input stream containing \tn{outer} macros, so that wrapping all
% control sequences in \cs{exp_not:n} is unsafe. Instead, we write the
% more elaborate \cs{__kernel_exp_not:w} \cs{exp_after:wN} |{|
% \cs{exp_not:N} |\cs| |}|. (On the other hand we make a better effort
% by avoiding \cs{exp_not:n} for characters other than active and macro
% parameters.)
%
%    \begin{macrocode}
%<*package>
%    \end{macrocode}
%
% \subsection{Variables and helper functions}
%
% \begin{variable}{\s_@@}
%   The scan mark \cs{s_@@} is used as a delimiter in the internal
%   format.
%   This is more practical than using a quark, because we would
%   then need to control expansion much more carefully: compare
%   \cs{int_value:w} |`#1| \cs{s_@@} with \cs{int_value:w} |`#1|
%   \cs{exp_stop_f:} \cs{exp_not:N} \cs{q_mark} to extract a character
%   code followed by the delimiter in an \texttt{e}-expansion.
%    \begin{macrocode}
\scan_new:N \s_@@
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}
%   {\l_@@_analysis_token, \l_@@_analysis_char_token}
%   The tokens in the token list are probed with the \TeX{} primitive
%   \tn{futurelet}. We use \cs{l_@@_analysis_token} in that
%   construction. In some cases, we convert the following token to a
%   string before probing it: then the token variable used is
%   \cs{l_@@_analysis_char_token}.
%   Both are initialised as copies of the character~|?|.
%    \begin{macrocode}
\cs_new_eq:NN \l_@@_analysis_token ?
\cs_new_eq:NN \l_@@_analysis_char_token ?
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_peek_code_tl}
%   Holds some code to be run once the next token has been fully
%   analysed in \cs{peek_analysis_map_inline:n}.
%    \begin{macrocode}
\tl_new:N \l_@@_peek_code_tl
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\c_@@_peek_catcodes_tl}
%   A token list containing the character number~$32$ (space) with all
%   possible category codes except $1$ and $2$ (begin-group and
%   end-group). Why $32$? Because some \LuaTeX{} versions only allow
%   creation of catcode~$10$ (space) tokens with this character code, so
%   that we decided to make \cs{char_generate:nn} refuse to create such
%   weird spaces as well. We do not include the macro parameter case
%   (catcode~$6$) because it cannot be used as a macro delimiter.
%   The constant is built inside a group in which the active space is
%   made equal to \cs{scan_stop:}. Each space is followed by the
%   hexadecimal digit naming its category code, the letters |A|--|D|
%   being turned into strings by \cs{token_to_str:N}.
%    \begin{macrocode}
\group_begin:
  \char_set_active_eq:NN \ \scan_stop:
  \tl_const:Ne \c_@@_peek_catcodes_tl
    {
      \char_generate:nn { 32 } { 3 } 3
      \char_generate:nn { 32 } { 4 } 4
      \char_generate:nn { 32 } { 7 } 7
      \char_generate:nn { 32 } { 8 } 8
      \c_space_tl \token_to_str:N A
      \char_generate:nn { 32 } { 11 } \token_to_str:N B
      \char_generate:nn { 32 } { 12 } \token_to_str:N C
      \char_generate:nn { 32 } { 13 } \token_to_str:N D
    }
\group_end:
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_analysis_normal_int}
%   The number of normal (\texttt{N}-type argument) tokens since the
%   last special token.
%    \begin{macrocode}
\int_new:N \l_@@_analysis_normal_int
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_analysis_index_int}
%   During the first pass, this is the index in the array being built.
%   During the second pass, it is equal to the maximum index in the
%   array from the first pass.
%    \begin{macrocode}
\int_new:N \l_@@_analysis_index_int
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_analysis_nesting_int}
%   Nesting depth of explicit begin-group and end-group characters
%   during the first pass. This lets us detect the end of the token list
%   without a reserved end-marker.
%    \begin{macrocode}
\int_new:N \l_@@_analysis_nesting_int
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\l_@@_analysis_type_int}
%   When encountering special characters, we record their \enquote{type}
%   in this integer.
%    \begin{macrocode}
\int_new:N \l_@@_analysis_type_int
%    \end{macrocode}
% \end{variable}
%
% \begin{variable}{\g_@@_analysis_result_tl}
%   The result of the conversion is stored in this token list, with a
%   succession of items of the form
%   \begin{quote}
%     \meta{tokens} \cs{s_@@} \meta{catcode} \meta{char code} \cs{s_@@}
%   \end{quote}
%    \begin{macrocode}
\tl_new:N \g_@@_analysis_result_tl
%    \end{macrocode}
% \end{variable}
%
% \begin{macro}[EXP]{\@@_analysis_extract_charcode:}
% \begin{macro}[EXP]{\@@_analysis_extract_charcode_aux:w}
%   Extracting the character code from the meaning of
%   \cs{l_@@_analysis_token}. This has no error checking, and should
%   only be assumed to work for begin-group and end-group character
%   tokens. It produces a number in the form |`|\meta{char}.
%   The auxiliary discards the first two space-delimited words of the
%   meaning and leaves a backquote in front of what remains.
%    \begin{macrocode}
\cs_new:Npn \@@_analysis_extract_charcode:
  {
    \exp_after:wN \@@_analysis_extract_charcode_aux:w
      \token_to_meaning:N \l_@@_analysis_token
  }
\cs_new:Npn \@@_analysis_extract_charcode_aux:w #1 ~ #2 ~ { ` }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}[EXP]{\@@_analysis_cs_space_count:NN}
% \begin{macro}[EXP]{\@@_analysis_cs_space_count:w}
% \begin{macro}[EXP]{\@@_analysis_cs_space_count_end:w}
%   Counts the number of spaces in the string representation of its
%   second argument, as well as the number of characters following the
%   last space in that representation, and feeds the two numbers as
%   semicolon-delimited arguments to the first argument. When this
%   function is used, the escape character is printable and non-space.
%    \begin{macrocode}
\cs_new:Npn \@@_analysis_cs_space_count:NN #1 #2
  {
    \exp_after:wN #1
    \int_value:w \int_eval:w 0
      \exp_after:wN \@@_analysis_cs_space_count:w
        \token_to_str:N #2
        \fi: \@@_analysis_cs_space_count_end:w ; ~ !
  }
\cs_new:Npn \@@_analysis_cs_space_count:w #1 ~
  {
    \if_false: #1 #1 \fi:
    + 1
    \@@_analysis_cs_space_count:w
  }
\cs_new:Npn \@@_analysis_cs_space_count_end:w ; #1 \fi: #2 !
  { \exp_after:wN ; \int_value:w \str_count_ignore_spaces:n {#1} ; }
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
%
% \subsection{Plan of attack}
%
% Our goal is to produce a token list of the form roughly
% \begin{quote}
%   \meta{token 1} \cs{s_@@} \meta{catcode 1} \meta{char code 1} \cs{s_@@} \\
%   \meta{token 2} \cs{s_@@} \meta{catcode 2} \meta{char code 2} \cs{s_@@} \\
%   \ldots{}
%   \meta{token N} \cs{s_@@} \meta{catcode N} \meta{char code N} \cs{s_@@}
% \end{quote}
% Most but not all tokens can be grabbed as an undelimited
% (\texttt{N}-type) argument by \TeX{}. The plan is to have a two pass
% system. In the first pass, locate special tokens, and store them in
% various \tn{toks} registers. In the second pass, which is done within
% an \texttt{e}-expanding assignment, normal tokens are taken in as
% \texttt{N}-type arguments, and special tokens are retrieved from the
% \tn{toks} registers, and removed from the input stream by some means.
% The whole process takes linear time, because we avoid building the
% result one item at a time.
%
% We make the escape character printable (backslash, but this later
% oscillates between slash and backslash): this allows us to
% distinguish characters from control sequences.
%
% A token has two characteristics: its \tn{meaning}, and what it looks
% like for \TeX{} when it is in scanning mode (\emph{e.g.}, when
% capturing parameters for a macro). For our purposes, we distinguish
% the following meanings:
% \begin{itemize}
%   \item begin-group token (category code $1$), either space (character
%     code $32$), or non-space;
%   \item end-group token (category code $2$), either space (character
%     code $32$), or non-space;
%   \item space token (category code $10$, character code $32$);
%   \item anything else (then the token is always an \texttt{N}-type
%     argument).
% \end{itemize}
% The token itself can \enquote{look like} one of the following
% \begin{itemize}
%   \item a non-active character, in which case its meaning is
%     automatically that associated to its character code and category
%     code, we call it \enquote{true} character;
%   \item an active character;
%   \item a control sequence.
% \end{itemize}
% The only tokens which are not valid \texttt{N}-type arguments are true
% begin-group characters, true end-group characters, and true spaces.
% We detect those characters by scanning ahead with \tn{futurelet},
% then distinguishing true characters from control sequences set equal
% to them using the \tn{string} representation.
%
% The second pass is a simple exercise in expandable loops.
%
% \begin{macro}{\@@_analysis:n}
%   Everything is done within a group, and all definitions are
%   local. We use \cs{group_align_safe_begin/end:} to avoid problems in
%   case \cs{@@_analysis:n} is used within an alignment and its argument
%   contains alignment tab tokens.
%   The first pass \cs{@@_analysis_a:n} and the second pass
%   \cs{@@_analysis_b:n} each receive the whole token list as argument.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_analysis:n #1
  {
    \group_begin:
      \group_align_safe_begin:
        \@@_analysis_a:n {#1}
        \@@_analysis_b:n {#1}
      \group_align_safe_end:
    \group_end:
  }
%    \end{macrocode}
% \end{macro}
%
% \subsection{Disabling active characters}
%
% \begin{macro}{\@@_analysis_disable:n}
%   Active characters can cause problems later on in the processing, so
%   we provide a way to disable them, by setting them to
%   \texttt{undefined}. Since Unicode contains too many characters to
%   loop over all of them, we instead do this whenever we encounter a
%   character. For \pTeX{} and \upTeX{} we skip characters beyond
%   $[0,255]$ because \tn{lccode} only allows those values.
%   The character code is stored as the \tn{lccode} of character~$0$,
%   so that \tn{lowercase} turns the active null character into the
%   active character that is then \cs{let} to \texttt{undefined}.
%    \begin{macrocode}
\group_begin:
  \char_set_catcode_active:N \^^@
  \cs_new_protected:Npn \@@_analysis_disable:n #1
    {
      \tex_lccode:D 0 = #1 \exp_stop_f:
      \tex_lowercase:D { \tex_let:D ^^@ } \tex_undefined:D
    }
  \bool_lazy_or:nnT
    { \sys_if_engine_ptex_p: }
    { \sys_if_engine_uptex_p: }
    {
      \cs_gset_protected:Npn \@@_analysis_disable:n #1
        {
          \if_int_compare:w 256 > #1 \exp_stop_f:
            \tex_lccode:D 0 = #1 \exp_stop_f:
            \tex_lowercase:D { \tex_let:D ^^@ } \tex_undefined:D
          \fi:
        }
    }
\group_end:
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_analysis_disable_char:N}
%   Similar to \cs{@@_analysis_disable:n}, but it receives a normal
%   character token, tests if that token is active (by turning it into
%   a space: the active space has been undefined at this point), and
%   if so, disables it. Even if the character is active and set equal
%   to a primitive conditional, nothing blows up.
%   Again, in \pTeX{} and \upTeX{} we skip characters beyond $[0,255]$,
%   which cannot be active anyways.
%    \begin{macrocode}
\group_begin:
  \char_set_catcode_active:N \^^@
  \cs_new_protected:Npn \@@_analysis_disable_char:N #1
    {
      \tex_lccode:D `#1 = 32 \exp_stop_f:
      \tex_lowercase:D { \if_meaning:w #1 } \tex_undefined:D
        \tex_let:D #1 \tex_undefined:D
      \fi:
    }
  \bool_lazy_or:nnT
    { \sys_if_engine_ptex_p: }
    { \sys_if_engine_uptex_p: }
    {
      \cs_gset_protected:Npn \@@_analysis_disable_char:N #1
        {
          \if_int_compare:w 256 > `#1 \exp_stop_f:
            \tex_lccode:D `#1 = 32 \exp_stop_f:
            \tex_lowercase:D { \if_meaning:w #1 } \tex_undefined:D
              \tex_let:D #1 \tex_undefined:D
            \fi:
          \fi:
        }
    }
\group_end:
%    \end{macrocode}
% \end{macro}
%
% \subsection{First pass}
%
% The goal of this pass is to detect special (non-\texttt{N}-type) tokens,
% and count how many \texttt{N}-type tokens lie between special tokens.
% Also, we wish to store some representation of each special token
% in a \tn{toks} register.
%
% We have $11$ types of tokens:
% \begin{itemize}
%   \item[1.] a true non-space begin-group character;
%   \item[2.]
%     a true space begin-group character;
%   \item[3.] a true non-space end-group character;
%   \item[4.] a true space end-group character;
%   \item[5.] a true space blank space character;
%   \item[6.] an active character;
%   \item[7.] any other true character;
%   \item[8.] a control sequence equal to a begin-group token (category code $1$);
%   \item[9.] a control sequence equal to an end-group token (category code $2$);
%   \item[10.] a control sequence equal to a space token
%     (character code $32$, category code $10$);
%   \item[11.] any other control sequence.
% \end{itemize}
% Our first tool is \tn{futurelet}. This cannot distinguish
% case $8$ from $1$ or $2$, nor case $9$ from $3$ or $4$,
% nor case $10$ from case $5$. Those cases are later distinguished
% by applying the \tn{string} primitive to the following token,
% after possibly changing the escape character to ensure that
% a control sequence's string representation cannot be mistaken
% for the true character.
%
% In cases $6$, $7$, and $11$, the following token is a valid
% \texttt{N}-type argument, so we grab it and distinguish the case
% of a character from a control sequence: in the latter case,
% \cs{str_tail:n} \Arg{token} is non-empty, because the
% escape character is printable.
%
% \begin{macro}{\@@_analysis_a:n}
%   We read tokens one by one using \tn{futurelet}.
%   While performing the loop, we keep track of the number of
%   true begin-group characters minus the number of
%   true end-group characters in \cs{l_@@_analysis_nesting_int}.
%   This reaches $-1$ when we read the closing brace.
%   Before looping, the active space is disabled, the escape character
%   is set to character code~$92$ and the three counters are reset.
%   The construction \cs{if_false:} |{| \cs{fi:} balances the explicit
%   closing brace placed after the token list. Once the loop is over
%   \cs{l_@@_analysis_index_int} is decremented by one.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_analysis_a:n #1
  {
    \@@_analysis_disable:n { 32 }
    \int_set:Nn \tex_escapechar:D { 92 }
    \int_zero:N \l_@@_analysis_normal_int
    \int_zero:N \l_@@_analysis_index_int
    \int_zero:N \l_@@_analysis_nesting_int
    \if_false: { \fi: \@@_analysis_a_loop:w #1 }
    \int_decr:N \l_@@_analysis_index_int
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_analysis_a_loop:w}
%   Read one character and check its type.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_analysis_a_loop:w
  { \tex_futurelet:D \l_@@_analysis_token \@@_analysis_a_type:w }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_analysis_a_type:w}
%   At this point, \cs{l_@@_analysis_token} holds the meaning
%   of the following token. We store in \cs{l_@@_analysis_type_int}
%   information about the meaning of the token ahead:
%   \begin{itemize}
%     \item 0 space token;
%     \item 1 begin-group token;
%     \item -1 end-group token;
%     \item 2 other.
%   \end{itemize}
%   The values $0$, $1$, $-1$ correspond to how much a true such
%   character changes the nesting level ($2$ is used only here,
%   and is irrelevant later). Then call the auxiliary for each case.
%   Note that nesting conditionals here is safe because we only skip
%   over \cs{l_@@_analysis_token} if it matches with one of the
%   character tokens (hence is not a primitive conditional).
%   In the final \cs{if_case:w} dispatch the value $-1$ (end-group)
%   is handled by the \cs{else:} branch.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_analysis_a_type:w
  {
    \l_@@_analysis_type_int =
      \if_meaning:w \l_@@_analysis_token \c_space_token
        0
      \else:
        \if_catcode:w \exp_not:N \l_@@_analysis_token \c_group_begin_token
          1
        \else:
          \if_catcode:w \exp_not:N \l_@@_analysis_token \c_group_end_token
            - 1
          \else:
            2
          \fi:
        \fi:
      \fi:
      \exp_stop_f:
    \if_case:w \l_@@_analysis_type_int
         \exp_after:wN \@@_analysis_a_space:w
    \or: \exp_after:wN \@@_analysis_a_bgroup:w
    \or: \exp_after:wN \@@_analysis_a_safe:N
    \else: \exp_after:wN \@@_analysis_a_egroup:w
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_analysis_a_space:w}
% \begin{macro}{\@@_analysis_a_space_test:w}
%   In this branch, the following token's meaning is a blank space.
%   Apply \tn{string} to that token: a true blank space gives a space, a
%   control sequence gives a result starting with the escape character,
%   an active character gives something else than a space since we
%   disabled the space. We grab as \cs{l_@@_analysis_char_token} the first
%   character of the string representation then test it in
%   \cs{@@_analysis_a_space_test:w}.
%   Also, since \cs{@@_analysis_a_store:} expects the special token to be
%   stored in the relevant \tn{toks} register, we do that. The extra
%   \cs{exp_not:n} is unnecessary of course, but it makes the treatment
%   of all tokens more homogeneous.
%   If we discover that the next token was actually a control sequence
%   or an active character
%   instead of a true space, then we step the counter of normal tokens.
%   We now have in front of us the whole string representation of
%   the control sequence, including potential spaces; those will appear
%   to be true spaces later in this pass. Hence, all other branches of
%   the code in this first pass need to consider the string representation,
%   so that the second pass does not need to test the meaning of tokens,
%   only strings.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_analysis_a_space:w
  {
    \tex_afterassignment:D \@@_analysis_a_space_test:w
    \exp_after:wN \cs_set_eq:NN
    \exp_after:wN \l_@@_analysis_char_token
    \token_to_str:N
  }
\cs_new_protected:Npn \@@_analysis_a_space_test:w
  {
    \if_meaning:w \l_@@_analysis_char_token \c_space_token
      \tex_toks:D \l_@@_analysis_index_int { \exp_not:n { ~ } }
      \@@_analysis_a_store:
    \else:
      \int_incr:N \l_@@_analysis_normal_int
    \fi:
    \@@_analysis_a_loop:w
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_analysis_a_bgroup:w, \@@_analysis_a_egroup:w}
% \begin{macro}
%   {\@@_analysis_a_group:nw, \@@_analysis_a_group_aux:w, \@@_analysis_a_group_auxii:w, \@@_analysis_a_group_test:w}
%   The token is most likely a true character token with catcode $1$ or
%   $2$, but it might be a control sequence, or an active character.
%   Optimizing for the first case, we store in a toks register some code
%   that expands to that token. Since we will turn what follows into
%   a string, we make sure the escape character is different from the
%   current character code (by switching between solidus and backslash).
%   To detect the special case of an active character let to the catcode
%   $1$ or~$2$ character with the same character code, we disable the
%   active character with that character code and re-test: if the
%   following token has become undefined we can in fact safely grab it.
%   We are finally ready to turn what follows to a string and test it.
%   This is one place where we need \cs{l_@@_analysis_char_token} to be a
%   separate control sequence from \cs{l_@@_analysis_token}, to compare them.
%   The escape character is switched by setting it to $139$ minus its
%   current value, which exchanges $92$ (backslash) and $47$ (solidus).
%    \begin{macrocode}
\group_begin:
  \char_set_catcode_group_begin:N \^^@ % {
  \cs_new_protected:Npn \@@_analysis_a_bgroup:w
    { \@@_analysis_a_group:nw { \exp_after:wN ^^@ \if_false: } \fi: } }
  \char_set_catcode_group_end:N \^^@
  \cs_new_protected:Npn \@@_analysis_a_egroup:w
    { \@@_analysis_a_group:nw { \if_false: { \fi: ^^@ } } % }
\group_end:
\cs_new_protected:Npn \@@_analysis_a_group:nw #1
  {
    \tex_lccode:D 0 = \@@_analysis_extract_charcode: \scan_stop:
    \tex_lowercase:D { \tex_toks:D \l_@@_analysis_index_int {#1} }
    \if_int_compare:w \tex_lccode:D 0 = \tex_escapechar:D
      \int_set:Nn \tex_escapechar:D { 139 - \tex_escapechar:D }
    \fi:
    \@@_analysis_disable:n { \tex_lccode:D 0 }
    \tex_futurelet:D \l_@@_analysis_token \@@_analysis_a_group_aux:w
  }
\cs_new_protected:Npn \@@_analysis_a_group_aux:w
  {
    \if_meaning:w \l_@@_analysis_token \tex_undefined:D
      \exp_after:wN \@@_analysis_a_safe:N
    \else:
      \exp_after:wN \@@_analysis_a_group_auxii:w
    \fi:
  }
\cs_new_protected:Npn \@@_analysis_a_group_auxii:w
  {
    \tex_afterassignment:D \@@_analysis_a_group_test:w
    \exp_after:wN \cs_set_eq:NN
    \exp_after:wN \l_@@_analysis_char_token
    \token_to_str:N
  }
\cs_new_protected:Npn \@@_analysis_a_group_test:w
  {
    \if_charcode:w \l_@@_analysis_token \l_@@_analysis_char_token
      \@@_analysis_a_store:
    \else:
      \int_incr:N \l_@@_analysis_normal_int
    \fi:
    \@@_analysis_a_loop:w
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}{\@@_analysis_a_store:}
%   This function is called each time we meet a special token;
%   at this point, the \tn{toks} register \cs{l_@@_analysis_index_int}
%   holds a token list which expands to the given special token.
%   Also, the value of \cs{l_@@_analysis_type_int} indicates which case
%   we are in:
%   \begin{itemize}
%     \item -1 end-group character;
%     \item 0 space character;
%     \item 1 begin-group character.
%   \end{itemize}
%   We need to distinguish further the case of a space character
%   (code $32$) from other character codes, because those
%   behave differently in the second pass.
%   Namely, after testing
%   the \tn{lccode} of $0$ (which holds the present character code)
%   we change the cases above to
%   \begin{itemize}
%     \item -2 space end-group character;
%     \item -1 non-space end-group character;
%     \item 0 space blank space character;
%     \item 1 non-space begin-group character;
%     \item 2 space begin-group character.
%   \end{itemize}
%   This has the property that non-space characters correspond to odd
%   values of \cs{l_@@_analysis_type_int}. The number of normal tokens until
%   here and the type of special token are packed into a \tn{skip}
%   register. Finally, we check whether we reached the last closing
%   brace, in which case we stop by disabling the looping function
%   (locally).
%   The doubling for space characters is obtained by advancing
%   \cs{l_@@_analysis_type_int} by its own value.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_analysis_a_store:
  {
    \tex_advance:D \l_@@_analysis_nesting_int \l_@@_analysis_type_int
    \if_int_compare:w \tex_lccode:D 0 = `\ \exp_stop_f:
      \tex_advance:D \l_@@_analysis_type_int \l_@@_analysis_type_int
    \fi:
    \tex_skip:D \l_@@_analysis_index_int
      = \l_@@_analysis_normal_int sp
        plus \l_@@_analysis_type_int sp \scan_stop:
    \int_incr:N \l_@@_analysis_index_int
    \int_zero:N \l_@@_analysis_normal_int
    \if_int_compare:w \l_@@_analysis_nesting_int = - \c_one_int
      \cs_set_eq:NN \@@_analysis_a_loop:w \scan_stop:
    \fi:
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_analysis_a_safe:N}
% \begin{macro}{\@@_analysis_a_cs:ww}
%   This should be the simplest case: since the upcoming token is safe,
%   we can simply grab it in a second pass. If the token is a single
%   character (including space), the \cs{if_charcode:w} test yields
%   true; we disable a potentially active character (that could
%   otherwise masquerade as the true character in the next pass) and we
%   count one \enquote{normal} token. On the
%   other hand, if the token is a control sequence, we should replace it by
%   its string representation for compatibility with other code
%   branches.
%   Instead of slowly looping through the characters with
%   the main code, we use the knowledge of how the second pass works:
%   if the control sequence name contains no space, count that token
%   as a number of normal tokens equal to its string length. If the
%   control sequence contains spaces, they should be registered as
%   special characters by increasing \cs{l_@@_analysis_index_int}
%   (no need to carefully count characters between each space), and
%   all characters after the last space should be counted in the
%   following sequence of \enquote{normal} tokens.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_analysis_a_safe:N #1
  {
    \if_charcode:w
        \scan_stop:
        \exp_after:wN \use_none:n \token_to_str:N #1 \prg_do_nothing:
        \scan_stop:
      \exp_after:wN \use_i:nn
    \else:
      \exp_after:wN \use_ii:nn
    \fi:
      {
        \@@_analysis_disable_char:N #1
        \int_incr:N \l_@@_analysis_normal_int
      }
      { \@@_analysis_cs_space_count:NN \@@_analysis_a_cs:ww #1 }
    \@@_analysis_a_loop:w
  }
\cs_new_protected:Npn \@@_analysis_a_cs:ww #1; #2;
  {
    \if_int_compare:w #1 > \c_zero_int
      \tex_skip:D \l_@@_analysis_index_int
        = \int_eval:n { \l_@@_analysis_normal_int + 1 } sp \exp_stop_f:
      \tex_advance:D \l_@@_analysis_index_int #1 \exp_stop_f:
    \else:
      \tex_advance:D
    \fi:
    \l_@@_analysis_normal_int #2 \exp_stop_f:
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \subsection{Second pass}
%
% The second pass is an exercise in expandable loops.
% All the necessary information is stored in \tn{skip}
% and \tn{toks} registers.
%
% \begin{macro}{\@@_analysis_b:n}
% \begin{macro}[EXP]{\@@_analysis_b_loop:w}
%   Start the loop with the index $0$. No need for an end-marker:
%   the loop stops by itself when the last index is read.
%   We repeatedly oscillate between reading long stretches
%   of normal tokens, and reading special tokens.
%   The result is stored globally in \cs{g_@@_analysis_result_tl}; a
%   \cs{prg_break_point:} is placed after the token list.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_analysis_b:n #1
  {
    \__kernel_tl_gset:Nx \g_@@_analysis_result_tl
      {
        \@@_analysis_b_loop:w 0; #1
        \prg_break_point:
      }
  }
\cs_new:Npn \@@_analysis_b_loop:w #1;
  {
    \exp_after:wN \@@_analysis_b_normals:ww
      \int_value:w \tex_skip:D #1 ; #1 ;
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}[EXP]{\@@_analysis_b_normals:ww}
% \begin{macro}[EXP]{\@@_analysis_b_normal:wwN}
%   The first argument is the number of normal tokens which remain
%   to be read, and the second argument is the index in the array
%   produced in the first step.
%   A character's string representation is always one character long,
%   while a control sequence is always longer (we have set the escape
%   character to a printable value). In both cases, we leave
%   \cs{exp_not:n} \Arg{token} \cs{s_@@} in the input stream
%   (after \texttt{e}-expansion). Here, \cs{exp_not:n} is used
%   rather than \cs{exp_not:N} because |#3| could be
%   a macro parameter character or could be \cs{s_@@}
%   (which must be hidden behind braces in the result).
%    \begin{macrocode}
\cs_new:Npn \@@_analysis_b_normals:ww #1;
  {
    \if_int_compare:w #1 = \c_zero_int
      \@@_analysis_b_special:w
    \fi:
    \@@_analysis_b_normal:wwN #1;
  }
\cs_new:Npn \@@_analysis_b_normal:wwN #1; #2; #3
  {
    \exp_not:n { \exp_not:n { #3 } } \s_@@
    \if_charcode:w
        \scan_stop:
        \exp_after:wN \use_none:n \token_to_str:N #3 \prg_do_nothing:
        \scan_stop:
      \exp_after:wN \@@_analysis_b_char:Nn
      \exp_after:wN \@@_analysis_b_char_aux:nww
    \else:
      \exp_after:wN \@@_analysis_b_cs:Nww
    \fi:
    #3 #1; #2;
  }
%    \end{macrocode}
% \end{macro}
% \end{macro}
%
% \begin{macro}[EXP]{\@@_analysis_b_char:Nn, \@@_analysis_b_char_aux:nww}
%   This function is called here with arguments
%   \cs{@@_analysis_b_char_aux:nww} and a normal character, while in the
%   peek analysis code it is called with \cs{use_none:n} and possibly a
%   space character, which is why the function has signature |Nn|.
%   If the normal token we grab is a character, leave
%   \meta{catcode} \meta{charcode} followed by \cs{s_@@}
%   in the input stream, and call \cs{@@_analysis_b_normals:ww}
%   with its first argument decremented.
%   The definition is \texttt{e}-expanded once, so that the letters
%   |A|--|D| are stored as strings; the conditionals themselves are
%   protected from that expansion by \cs{exp_not:N}.
%    \begin{macrocode}
\cs_new:Npe \@@_analysis_b_char:Nn #1#2
  {
    \exp_not:N \if_meaning:w #2 \exp_not:N \tex_undefined:D
      \token_to_str:N D \exp_not:N \else:
    \exp_not:N \if_catcode:w #2 \c_catcode_other_token
      \token_to_str:N C \exp_not:N \else:
    \exp_not:N \if_catcode:w #2 \c_catcode_letter_token
      \token_to_str:N B \exp_not:N \else:
    \exp_not:N \if_catcode:w #2 \c_math_toggle_token
      3 \exp_not:N \else:
    \exp_not:N \if_catcode:w #2 \c_alignment_token
      4 \exp_not:N \else:
    \exp_not:N \if_catcode:w #2 \c_math_superscript_token
      7 \exp_not:N \else:
    \exp_not:N \if_catcode:w #2 \c_math_subscript_token
      8 \exp_not:N \else:
    \exp_not:N \if_catcode:w #2 \c_space_token
      \token_to_str:N A \exp_not:N \else:
      6
    \exp_not:n { \fi: \fi: \fi: \fi: \fi: \fi: \fi: \fi: }
    #1 {#2}
  }
\cs_new:Npn \@@_analysis_b_char_aux:nww #1
  {
    \int_value:w `#1 \s_@@
    \exp_after:wN \@@_analysis_b_normals:ww
      \int_value:w \int_eval:w - 1 +
  }
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}[EXP]{\@@_analysis_b_cs:Nww}
% \begin{macro}[EXP]{\@@_analysis_b_cs_test:ww}
%   If the token we grab is a control sequence, leave
%   |0 -1| (as category code and character code) in the input stream,
%   followed by \cs{s_@@},
%   and call \cs{@@_analysis_b_normals:ww} with updated arguments.
% \begin{macrocode} \cs_new:Npn \@@_analysis_b_cs:Nww #1 { 0 -1 \s_@@ \@@_analysis_cs_space_count:NN \@@_analysis_b_cs_test:ww #1 } \cs_new:Npn \@@_analysis_b_cs_test:ww #1 ; #2 ; #3 ; #4 ; { \exp_after:wN \@@_analysis_b_normals:ww \int_value:w \int_eval:w \if_int_compare:w #1 = \c_zero_int #3 \else: \tex_skip:D \int_eval:n { #4 + #1 } \exp_stop_f: \fi: - #2 \exp_after:wN ; \int_value:w \int_eval:n { #4 + #1 } ; } % \end{macrocode} % \end{macro} % \end{macro} % % \begin{macro}[EXP]{\@@_analysis_b_special:w} % \begin{macro}[EXP]{\@@_analysis_b_special_char:wN} % \begin{macro}[EXP]{\@@_analysis_b_special_space:w} % Here, |#1| is the current index in the array built in the first pass. % Check now whether we reached the end (we shouldn't keep the trailing % end-group character that marked the end of the token list in the % first pass). % Unpack the \tn{toks} register: when \texttt{e}/\texttt{x}-expanding again, % we will get the special token. % Then leave the category code in the input stream, followed by % the character code, and call \cs{@@_analysis_b_loop:w} with the next index. % The stretch component of the \tn{skip} register with index~|#1| is % used twice: an \cs{if_case:w} on it selects the category code (|A|, % |1| or~|2|), and its parity decides between % \cs{@@_analysis_b_special_char:wN} (odd) and % \cs{@@_analysis_b_special_space:w} (even).
% \begin{macrocode} \group_begin: \char_set_catcode_other:N A \cs_new:Npn \@@_analysis_b_special:w \fi: \@@_analysis_b_normal:wwN 0 ; #1 ; { \fi: \if_int_compare:w #1 = \l_@@_analysis_index_int \exp_after:wN \prg_break: \fi: \tex_the:D \tex_toks:D #1 \s_@@ \if_case:w \tex_gluestretch:D \tex_skip:D #1 \exp_stop_f: \token_to_str:N A \or: 1 \or: 1 \else: 2 \fi: \if_int_odd:w \tex_gluestretch:D \tex_skip:D #1 \exp_stop_f: \exp_after:wN \@@_analysis_b_special_char:wN \int_value:w \else: \exp_after:wN \@@_analysis_b_special_space:w \int_value:w \fi: \int_eval:n { 1 + #1 } \exp_after:wN ; \token_to_str:N } \group_end: \cs_new:Npn \@@_analysis_b_special_char:wN #1 ; #2 { \int_value:w `#2 \s_@@ \@@_analysis_b_loop:w #1 ; } \cs_new:Npn \@@_analysis_b_special_space:w #1 ; ~ { 32 \s_@@ \@@_analysis_b_loop:w #1 ; } % \end{macrocode} % \end{macro} % \end{macro} % \end{macro} % % \subsection{Mapping through the analysis} % % \begin{macro}{\tl_analysis_map_inline:Nn, \tl_analysis_map_inline:nn} % \begin{macro}{\@@_analysis_map:Nn} % \begin{macro}{\@@_analysis_map:NwNw} % First obtain the analysis of the token list into % \cs{g_@@_analysis_result_tl}. To allow nested mappings, increase the % nesting depth \cs{g__kernel_prg_map_int} (shared between all % modules), then define the payload macro, which runs the user code % and has a name specific to that nesting depth. The looping macro % grabs the \meta{tokens}, \meta{catcode} and \meta{char code}; it % checks for the end of the loop with \cs{use_none:n} |#3|, normally % empty (the single \meta{catcode} character is discarded), % but which becomes \cs{tl_map_break:} at the end; it then % calls the payload macro with the arguments in the correct order % (this is the reason why we cannot directly use the same macro for % looping and payload), and loops by calling itself. When the loop % ends, remember to decrease the nesting depth.
% \begin{macrocode} \cs_new_protected:Npn \tl_analysis_map_inline:Nn #1 { \exp_args:No \tl_analysis_map_inline:nn #1 } \cs_new_protected:Npn \tl_analysis_map_inline:nn #1 { \@@_analysis:n {#1} \int_gincr:N \g__kernel_prg_map_int \exp_args:Nc \@@_analysis_map:Nn { @@_analysis_map_inline_ \int_use:N \g__kernel_prg_map_int :wNw } } \cs_new_protected:Npn \@@_analysis_map:Nn #1#2 { \cs_gset_protected:Npn #1 ##1##2##3 {#2} \exp_after:wN \@@_analysis_map:NwNw \exp_after:wN #1 \g_@@_analysis_result_tl \s_@@ { ? \tl_map_break: } \s_@@ \prg_break_point:Nn \tl_map_break: { \int_gdecr:N \g__kernel_prg_map_int } } \cs_new_protected:Npn \@@_analysis_map:NwNw #1 #2 \s_@@ #3 #4 \s_@@ { \use_none:n #3 #1 {#2} {#4} {#3} \@@_analysis_map:NwNw #1 } % \end{macrocode} % \end{macro} % \end{macro} % \end{macro} % % \subsection{Showing the results} % % \begin{macro}{\tl_analysis_show:N, \tl_analysis_log:N, \@@_analysis_show:NNN} % Add to \cs{@@_analysis:n} a third pass to display tokens to the % terminal (or the log). % If the token list variable is not defined, throw the same error % as \cs{tl_show:N} (or \cs{tl_log:N}) by simply calling that function. % The auxiliary receives the message function as~|#1|, the function % to fall back on as~|#2|, and the token list variable as~|#3|. % \begin{macrocode} \cs_new_protected:Npn \tl_analysis_show:N { \@@_analysis_show:NNN \msg_show:nneeee \tl_show:N } \cs_new_protected:Npn \tl_analysis_log:N { \@@_analysis_show:NNN \msg_log:nneeee \tl_log:N } \cs_new_protected:Npn \@@_analysis_show:NNN #1#2#3 { \tl_if_exist:NTF #3 { \exp_args:No \@@_analysis:n {#3} #1 { tl } { show-analysis } { \token_to_str:N #3 } { \@@_analysis_show: } { } { } } { #2 #3 } } % \end{macrocode} % \end{macro} % % \begin{macro}{\tl_analysis_show:n, \tl_analysis_log:n, \@@_analysis_show:Nn} % No existence test needed here. The first argument of the message is % left empty since there is no variable name to display.
% \begin{macrocode} \cs_new_protected:Npn \tl_analysis_show:n { \@@_analysis_show:Nn \msg_show:nneeee } \cs_new_protected:Npn \tl_analysis_log:n { \@@_analysis_show:Nn \msg_log:nneeee } \cs_new_protected:Npn \@@_analysis_show:Nn #1#2 { \@@_analysis:n {#2} #1 { tl } { show-analysis } { } { \@@_analysis_show: } { } { } } % \end{macrocode} % \end{macro} % % \begin{macro}[rEXP]{\@@_analysis_show:, \@@_analysis_show_loop:wNw} % Here, |#1| \texttt{o}- and \texttt{e}/\texttt{x}-expands to the token; % |#2| is the category code (one uppercase hexadecimal digit), % $0$ for control sequences; % |#3| is the character code, which we ignore. % In the cases of control sequences and active characters, % the meaning may overflow one line, and we want to truncate % it. Those cases are thus separated out. % The loop stops when it reaches the trailing marker, whose |#2| is % |? \prg_break:| so that \cs{use_none:n} |#2| leaves \cs{prg_break:}. % Each token is shown on a new line that starts with |>| and two spaces. % \begin{macrocode} \cs_new:Npn \@@_analysis_show: { \exp_after:wN \@@_analysis_show_loop:wNw \g_@@_analysis_result_tl \s_@@ { ? \prg_break: } \s_@@ \prg_break_point: } \cs_new:Npn \@@_analysis_show_loop:wNw #1 \s_@@ #2 #3 \s_@@ { \use_none:n #2 \iow_newline: > \use:nn { ~ } { ~ } \if_int_compare:w "#2 = \c_zero_int \exp_after:wN \@@_analysis_show_cs:n \else: \if_int_compare:w "#2 = 13 \exp_stop_f: \exp_after:wN \exp_after:wN \exp_after:wN \@@_analysis_show_active:n \else: \exp_after:wN \exp_after:wN \exp_after:wN \@@_analysis_show_normal:n \fi: \fi: {#1} \@@_analysis_show_loop:wNw } % \end{macrocode} % \end{macro} % % \begin{macro}[rEXP]{\@@_analysis_show_normal:n} % Non-active characters are a simple matter of printing % the character, and its meaning. Our test suite checks that % begin-group and end-group characters do not mess up % \TeX{}'s alignment status. % \begin{macrocode} \cs_new:Npn \@@_analysis_show_normal:n #1 { \exp_after:wN \token_to_str:N #1 ~ ( \exp_after:wN \token_to_meaning:N #1 ) } % \end{macrocode} % \end{macro} % % \begin{macro}[EXP]{\@@_analysis_show_value:N} % This expands to the value of |#1| if it has any: % for a non-expandable token which is a chardef, a mathchardef, or a % dim, int, skip or toks register, it leaves |=| followed by the result % of applying \cs{tex_the:D} to~|#1|, and otherwise it leaves nothing.
% \begin{macrocode} \cs_new:Npn \@@_analysis_show_value:N #1 { \token_if_expandable:NF #1 { \token_if_chardef:NTF #1 \prg_break: { } \token_if_mathchardef:NTF #1 \prg_break: { } \token_if_dim_register:NTF #1 \prg_break: { } \token_if_int_register:NTF #1 \prg_break: { } \token_if_skip_register:NTF #1 \prg_break: { } \token_if_toks_register:NTF #1 \prg_break: { } \use_none:nnn \prg_break_point: \use:n { \exp_after:wN = \tex_the:D #1 } } } % \end{macrocode} % \end{macro} % % \begin{macro}[rEXP]{\@@_analysis_show_cs:n} % \begin{macro}[rEXP]{\@@_analysis_show_active:n} % \begin{macro}[rEXP]{\@@_analysis_show_long:nn} % \begin{macro}[rEXP]{\@@_analysis_show_long_aux:nnnn} % Control sequences and active characters are printed in the same way, % making sure that the printed line does not exceed % \cs{l_iow_line_count_int} characters. In case % of an overflow, we replace the last characters by % \cs{c_@@_analysis_show_etc_str}. % The auxiliary \cs{@@_analysis_show_long_aux:nnnn} receives the string % representation of the token, its meaning, its value (if any, see % \cs{@@_analysis_show_value:N}) and the text |control sequence=| or % |active character=|; it arranges them as % \meta{string}~|(|\meta{text}\meta{meaning}\meta{value}|)|. % The~$3$ subtracted from the line length presumably accounts for the % |>|~and two spaces that start each line. % \begin{macrocode} \cs_new:Npn \@@_analysis_show_cs:n #1 { \exp_args:No \@@_analysis_show_long:nn {#1} { control~sequence= } } \cs_new:Npn \@@_analysis_show_active:n #1 { \exp_args:No \@@_analysis_show_long:nn {#1} { active~character= } } \cs_new:Npn \@@_analysis_show_long:nn #1 { \@@_analysis_show_long_aux:oofn { \token_to_str:N #1 } { \token_to_meaning:N #1 } { \@@_analysis_show_value:N #1 } } \cs_new:Npn \@@_analysis_show_long_aux:nnnn #1#2#3#4 { \int_compare:nNnTF { \str_count:n { #1 ~ ( #4 #2 #3 ) } } > { \l_iow_line_count_int - 3 } { \str_range:nnn { #1 ~ ( #4 #2 #3 ) } { 1 } { \l_iow_line_count_int - 3 - \str_count:N \c_@@_analysis_show_etc_str } \c_@@_analysis_show_etc_str } { #1 ~ ( #4 #2 #3 ) } } \cs_generate_variant:Nn \@@_analysis_show_long_aux:nnnn { oof } % \end{macrocode} % \end{macro} % \end{macro} % \end{macro} % \end{macro} % % \subsection{Peeking ahead} % % \begin{macro}[EXP]{\peek_analysis_map_break:, \peek_analysis_map_break:n} % The break statements use the general \cs{prg_map_break:Nn}.
% \begin{macrocode} \cs_new:Npn \peek_analysis_map_break: { \prg_map_break:Nn \peek_analysis_map_break: { } } \cs_new:Npn \peek_analysis_map_break:n { \prg_map_break:Nn \peek_analysis_map_break: } % \end{macrocode} % \end{macro} % % \begin{variable}{\l_@@_peek_charcode_int} % Character code of (the meaning of) the special token currently being % analysed; it is set in \cs{@@_peek_analysis_special:} and compared in % \cs{@@_peek_analysis_str:n} with the first character of the token's % string representation. % \begin{macrocode} \int_new:N \l_@@_peek_charcode_int % \end{macrocode} % \end{variable} % % \begin{macro}{\@@_analysis_char_arg:Nw, \@@_analysis_char_arg_aux:Nw} % After a call to \tn{futurelet} \cs{l_@@_analysis_token} followed by % a stringified character token (either explicit space or catcode % other character), grab the argument and pass it to |#1|. We only % need to do anything in the case of a space, which would otherwise be % skipped when |#1| grabs an undelimited argument; in the other cases % |#1| simply grabs the character itself. % \begin{macrocode} \cs_new:Npn \@@_analysis_char_arg:Nw { \if_meaning:w \l_@@_analysis_token \c_space_token \exp_after:wN \@@_analysis_char_arg_aux:Nw \fi: } \cs_new:Npn \@@_analysis_char_arg_aux:Nw #1 ~ { #1 { ~ } } % \end{macrocode} % \end{macro} % % \begin{macro} % { % \peek_analysis_map_inline:n, % \@@_peek_analysis_loop:NNn, \@@_peek_analysis_test:, % \@@_peek_analysis_exp:N, \@@_peek_analysis_exp_aux:Nw, % \@@_peek_analysis_nonexp:N, \@@_peek_analysis_cs:N, % \@@_peek_analysis_char:N, \@@_peek_analysis_char:w, % \@@_peek_analysis_special:, \@@_peek_analysis_retest:, % \@@_peek_analysis_str:, % \@@_peek_analysis_str:w, \@@_peek_analysis_str:n, % \@@_peek_analysis_active_str:n, \@@_peek_analysis_explicit:n, % \@@_peek_analysis_escape:, \@@_peek_analysis_collect:w, % \@@_peek_analysis_collect:n, \@@_peek_analysis_collect_loop:, % \@@_peek_analysis_collect_test:, \@@_peek_analysis_collect_end:NNNN % } % Save the user's code in a control sequence that is suitable for % nested maps.
We may wish to pass to this function an \tn{outer} % control sequence or active character; for this we will redefine % any expandable token (testing if it is \tn{outer} is much slower) % within a group, closed immediately after the function reads its % arguments to avoid affecting the user's code or even our peek code % (there is no risk of redefining \cs{group_end:} itself since that is % not expandable). This user's code function also % calls the loop auxiliary, and includes the trailing % \cs{prg_break_point:Nn} for when the user wants to stop the loop. % The loop auxiliary must remove that break point because it must look % at the input stream (it simply discards its three arguments, which is % why the initial call passes three dummy |?| tokens). % \begin{macrocode} \cs_new_protected:Npn \peek_analysis_map_inline:n #1 { \group_align_safe_begin: \int_gincr:N \g__kernel_prg_map_int \cs_set_protected:cpn { @@_analysis_map_ \int_use:N \g__kernel_prg_map_int :nnN } ##1##2##3 { \group_end: #1 \@@_peek_analysis_loop:NNn \prg_break_point:Nn \peek_analysis_map_break: { \int_gdecr:N \g__kernel_prg_map_int \group_align_safe_end: } } \@@_peek_analysis_loop:NNn ? ? ? } % \end{macrocode} % The loop starts a group (closed by the user-code function defined % above) with a normalized escape character, and checks if the next % token is special or \texttt{N}-type (distinguishing expandable from % non-expandable tokens). The test for nonexpandable tokens in % \cs{@@_peek_analysis_test:} must be done after the tests for % begin-group, end-group, and space tokens, in case \cs{l_peek_token} % is either \tn{outer} or is a primitive \TeX{} conditional, as such % tokens cannot be skipped over correctly by conditional code. % In \cs{@@_peek_analysis_test:} the number read by \cs{if_case:w} is % \cs{c_max_int} for special tokens (selecting the \cs{else:} branch), % $1$~for non-expandable tokens and $0$~for expandable ones.
% \begin{macrocode} \cs_new_protected:Npn \@@_peek_analysis_loop:NNn #1#2#3 { \group_begin: \tl_set:Ne \l_@@_peek_code_tl { \exp_not:c { @@_analysis_map_ \int_use:N \g__kernel_prg_map_int :nnN } } \int_set:Nn \tex_escapechar:D { `\\ } \peek_after:Nw \@@_peek_analysis_test: } \cs_new_protected:Npn \@@_peek_analysis_test: { \if_case:w \if_catcode:w \exp_not:N \l_peek_token { \c_max_int \fi: \if_catcode:w \exp_not:N \l_peek_token } \c_max_int \fi: \if_meaning:w \l_peek_token \c_space_token \c_max_int \fi: \exp_after:wN \if_meaning:w \exp_not:N \l_peek_token \l_peek_token \c_one_int \fi: \c_zero_int \exp_after:wN \exp_after:wN \exp_after:wN \@@_peek_analysis_exp:N \exp_after:wN \exp_not:N \or: \exp_after:wN \@@_peek_analysis_nonexp:N \else: \exp_after:wN \@@_peek_analysis_special: \fi: } % \end{macrocode} % Expandable tokens (which are automatically |N|-type) can be % \tn{outer} macros, hence the need for \cs{exp_after:wN} and % \cs{exp_not:N} in the code above, which allows the next function to % safely grab the token as an argument. To allow the % possibly-\tn{outer} token~|#1| as an argument of the \meta{user's % function} (which is protected and stored in \cs{l_@@_peek_code_tl}), % we set it equal to a harmless macro, namely the \meta{user's % function} itself. This must be done at the very % last minute because |#1| may be some pretty important function such % as \cs{exp_after:wN}. Using a primitive \cs{cs_set_nopar:Npe} % expansion (to avoid \tn{outer} problems) we set up to run the code % \tn{let} |#1| \meta{user's function} \meta{user's function} followed % by arguments involving~|#1|. Regardless of~|#1| (including the % user's function itself), the user's function is run. It always % starts with \cs{group_end:}, which has not been redefined since |#1| % started out as expandable, and which restores the definition of~|#1|.
% % Then we put the elaborate first argument % \cs{__kernel_exp_not:w} \cs{exp_after:wN} |{| \cs{exp_not:N} |#1| |}|: % indeed we cannot use \cs{exp_not:n} |{#1}| as this breaks for an % \tn{outer} macro and we cannot use \cs{exp_not:N} |#1|, as % \texttt{o}-expanding this yields a \enquote{notexpanded} token equal % to (a weird) \tn{relax}, which would have the wrong value for % primitive \TeX{} conditionals such as \cs{if_meaning:w}. % % Then we must add |{-1}0| if the token is a % control sequence and \Arg{charcode}|D| otherwise. Distinguishing % the two cases is easy: since we have made the escape character % printable, \cs{token_to_str:N} gives at least two characters for a % control sequence versus a single one for an active character % (possibly being a space, in which case the trailing brace group is % taken as the first argument of \cs{@@_peek_analysis_exp_aux:Nw}). % Importantly, |#1| could be an \tn{outer} token (as it is only % redefined at the last minute, when the code stored in % \cs{l_@@_peek_code_tl} runs) but once we apply % \cs{token_to_str:N} we no longer need to worry about it. % \begin{macrocode} \cs_new_protected:Npn \@@_peek_analysis_exp:N #1 { \cs_set_nopar:Npe \l_@@_peek_code_tl { \tex_let:D \exp_not:N #1 \l_@@_peek_code_tl \l_@@_peek_code_tl { \exp_not:n { \__kernel_exp_not:w \exp_after:wN } { \exp_not:N \exp_not:N \exp_not:N #1 } } \exp_after:wN \@@_peek_analysis_exp_aux:Nw \token_to_str:N #1 { } \s_@@ } \l_@@_peek_code_tl } \cs_new:Npe \@@_peek_analysis_exp_aux:Nw #1#2 \s_@@ { \exp_not:N \if_meaning:w \scan_stop: #2 \scan_stop: { \exp_not:N \int_value:w `#1 ~ } \token_to_str:N D \exp_not:N \else: { -1 } 0 \exp_not:N \fi: } % \end{macrocode} % For normal non-expandable tokens we must distinguish characters % (including active ones and macro parameter characters) from control % sequences (whose string representation is more than one character % because we made the escape character printable).
For a control % sequence call the user code with suitable arguments, wrapping |#1| % within \cs{exp_not:n} just in case it happens to be equal to a macro % parameter character. We do not skip \cs{exp_not:n} when % unnecessary, because this auxiliary is also called in % \cs{@@_peek_analysis_retest:} where we have changed some control % sequences or active characters to \cs{scan_stop:} temporarily. % \begin{macrocode} \cs_new_protected:Npn \@@_peek_analysis_nonexp:N #1 { \if_charcode:w \scan_stop: \exp_after:wN \use_none:n \token_to_str:N #1 \prg_do_nothing: \scan_stop: \exp_after:wN \@@_peek_analysis_char:N \else: \exp_after:wN \@@_peek_analysis_cs:N \fi: #1 } \cs_new_protected:Npn \@@_peek_analysis_cs:N #1 { \l_@@_peek_code_tl { \exp_not:n {#1} } { -1 } 0 } % \end{macrocode} % For normal characters we must determine their catcode. The main % difficulty is that the character may be an active character % masquerading as (i.e., set equal to) itself with a different % catcode. Two approaches based on \tn{lowercase} can detect this. % One could make an active character with the same character code % as~|#1| and % change its definition before testing the catcode of~|#1|, but in % some Unicode engines this fills up the hash table uselessly. % Instead, we lowercase~|#1| itself, changing its character code % to~$32$, namely space (because \LuaTeX{} cannot turn catcode~$10$ % characters into anything other than character code~$32$), then we apply % \cs{@@_analysis_b_char:Nn}, which detects active characters by % comparing them to \cs{tex_undefined:D}, and we must have undefined % the active space (locally) for this test to work. % To define \cs{@@_peek_analysis_char:N} itself we use an % |e|-expanding assignment to get the active space in the right place % after making it (just for this definition) unexpandable. % Finally \cs{@@_peek_analysis_char:w} receives the \meta{charcode}, % \meta{user function}, \meta{catcode}, and \meta{token}, and places % the arguments in the correct order.
It keeps \cs{exp_not:n} for % macro parameter characters and active characters (the latter could % be macro parameter characters, and it seems more uniform to always % put \cs{exp_not:n}), and otherwise eliminates it by expanding once % with \cs{exp_args:NNNo}. % \begin{macrocode} \group_begin: \char_set_active_eq:NN \ \scan_stop: \cs_new_protected:Npe \@@_peek_analysis_char:N #1 { \cs_set_eq:NN \char_generate:nn { 32 } { 13 } \exp_not:N \tex_undefined:D \tex_lccode:D `#1 = 32 \exp_stop_f: \tex_lowercase:D { \tl_put_right:Ne \exp_not:N \l_@@_peek_code_tl { \exp_not:n { \@@_analysis_b_char:Nn \use_none:n } {#1} } } \exp_not:n { \exp_after:wN \@@_peek_analysis_char:w \int_value:w } `#1 \exp_not:n { \exp_after:wN \s_@@ \l_@@_peek_code_tl } #1 } \group_end: \cs_new_protected:Npn \@@_peek_analysis_char:w #1 \s_@@ #2#3#4 { \if_charcode:w 6 #3 \else: \if_charcode:w D #3 \else: \exp_args:NNNo \fi: \fi: #2 { \exp_not:n {#4} } {#1} #3 } % \end{macrocode} % For special characters the idea is to eventually act with % \cs{token_to_str:N}, then pick up one by one the characters of this % string representation until hitting the token that follows. First % determine the character code of (the meaning of) the \meta{token} % (which we know is a special token), make sure the escape character % is different from it, normalize the meanings of two active % characters (those whose character code is that of the \meta{token} % or of the escape character) and of the empty control sequence by % setting them equal to \cs{scan_stop:}, and filter out these % cases in \cs{@@_peek_analysis_retest:}.
% \begin{macrocode} \cs_new_protected:Npn \@@_peek_analysis_special: { \tex_let:D \l_@@_analysis_token = ~ \l_peek_token \int_set:Nn \l_@@_peek_charcode_int { \@@_analysis_extract_charcode: } \if_int_compare:w \l_@@_peek_charcode_int = \tex_escapechar:D \int_set:Nn \tex_escapechar:D { `\/ } \fi: \char_set_active_eq:nN { \l_@@_peek_charcode_int } \scan_stop: \char_set_active_eq:nN { \tex_escapechar:D } \scan_stop: \cs_set_eq:cN { } \scan_stop: \tex_futurelet:D \l_@@_analysis_token \@@_peek_analysis_retest: } \cs_new_protected:Npn \@@_peek_analysis_retest: { \if_meaning:w \l_@@_analysis_token \scan_stop: \exp_after:wN \@@_peek_analysis_nonexp:N \else: \exp_after:wN \@@_peek_analysis_str: \fi: } % \end{macrocode} % At this point we know the meaning of the \meta{token} in the input % stream is \cs{l_peek_token}, either a space (character code~$32$, % category code~$10$) or a % begin-group or end-group token (catcode $1$ or~$2$), and we excluded % a few cases that would be difficult later (empty control sequence, % active character with the same character code as its meaning or as % the escape character). The idea is to apply \cs{token_to_str:N} to % the \meta{token} then grab characters (of category code~$12$ except % for spaces that have category code~$10$) to reconstruct it. In % earlier versions of the code we would peek at the \meta{next token} % that lies after \meta{token} in the input stream, which would help % us reconstruct the \meta{token} more accurately in edge % cases (mentioned below), but this had the side-effect of tokenizing % the input stream (turning characters into tokens) farther ahead than % needed. % % We hit the \meta{token} with \cs{token_to_str:N} and start grabbing % characters.
More % precisely, by looking at the first character in the string % representation of the \meta{token} we distinguish three cases: % a stringified control sequence starts with the escape character; for % an explicit character we find that same character (whose character % code was stored in \cs{l_@@_peek_charcode_int}); for an active % character we find anything else (we made sure to exclude the case of % an active character whose string representation coincides with the % other two cases). % \begin{macrocode} \cs_new_protected:Npn \@@_peek_analysis_str: { \exp_after:wN \tex_futurelet:D \exp_after:wN \l_@@_analysis_token \exp_after:wN \@@_peek_analysis_str:w \token_to_str:N } \cs_new_protected:Npn \@@_peek_analysis_str:w { \@@_analysis_char_arg:Nw \@@_peek_analysis_str:n } \cs_new_protected:Npn \@@_peek_analysis_str:n #1 { \int_case:nnF { `#1 } { { \l_@@_peek_charcode_int } { \@@_peek_analysis_explicit:n {#1} } { \tex_escapechar:D } { \@@_peek_analysis_escape: } } { \@@_peek_analysis_active_str:n {#1} } } % \end{macrocode} % When |#1| is a stringified active character we pass appropriate % arguments to the user's code; thankfully \cs{char_generate:nn} % can make active characters. % \begin{macrocode} \cs_new_protected:Npn \@@_peek_analysis_active_str:n #1 { \tl_put_right:Ne \l_@@_peek_code_tl { { \char_generate:nn { `#1 } { 13 } } { \int_value:w `#1 } \token_to_str:N D } \l_@@_peek_code_tl } % \end{macrocode} % When |#1| matches the character we had extracted from the meaning of % \cs{l_peek_token}, the token was an explicit character, which can be % a standard space, or a begin-group or end-group character with some % character code. In the latter two cases we call % \cs{char_generate:nn} with suitable arguments and put suitable % \cs{if_false:} \cs{fi:} constructions to make the result balanced % and such that \texttt{o}-expanding or \texttt{e}/\texttt{x}-expanding gives % back a single (unbalanced) begin-group or end-group character.
% \begin{macrocode} \cs_new_protected:Npn \@@_peek_analysis_explicit:n #1 { \tl_put_right:Ne \l_@@_peek_code_tl { \if_meaning:w \l_peek_token \c_space_token { ~ } { 32 } \token_to_str:N A \else: \if_catcode:w \l_peek_token \c_group_begin_token { \exp_not:N \exp_after:wN \char_generate:nn { `#1 } { 1 } \exp_not:N \if_false: \if_false: { \fi: } \exp_not:N \fi: } { \int_value:w `#1 } 1 \else: { \exp_not:N \if_false: { \if_false: } \fi: \exp_not:N \fi: \char_generate:nn { `#1 } { 2 } } { \int_value:w `#1 } 2 \fi: \fi: } \l_@@_peek_code_tl } % \end{macrocode} % Finally there is the case of a special token whose string % representation starts with an escape character, namely the token was % a control sequence. In that case we could have grabbed the token % directly as an \texttt{N}-type argument, but of course we couldn't % know that until we had run all the various tests including % stringifying the token. We are thus left with the hard work of % picking up one by one the characters in the csname (being careful % about spaces), until the constructed csname has the expected % meaning. This fails if someone defines a token like % \cs[no-index]{bgroup@my} whose string representation starts in the % same way as that of another token that has the same meaning, namely % an implicit character % token of category code $1$, $2$, or $10$: the loop then stops too % early, at the shorter name.
% \begin{macrocode} \cs_new_protected:Npn \@@_peek_analysis_escape: { \tl_clear:N \l_@@_internal_a_tl \tex_futurelet:D \l_@@_analysis_token \@@_peek_analysis_collect:w } \cs_new_protected:Npn \@@_peek_analysis_collect:w { \@@_analysis_char_arg:Nw \@@_peek_analysis_collect:n } \cs_new_protected:Npn \@@_peek_analysis_collect:n #1 { \tl_put_right:Nn \l_@@_internal_a_tl {#1} \@@_peek_analysis_collect_loop: } \cs_new_protected:Npn \@@_peek_analysis_collect_loop: { \exp_after:wN \if_meaning:w \cs:w \if_cs_exist:w \l_@@_internal_a_tl \cs_end: \l_@@_internal_a_tl \else: c_one % anything short \fi: \cs_end: \l_peek_token \@@_peek_analysis_collect_end:NNNN \fi: \tex_futurelet:D \l_@@_analysis_token \@@_peek_analysis_collect:w } % \end{macrocode} % As in all other cases, end by calling the user code with suitable % arguments (here |#1| is \cs{fi:}, while |#2|, |#3| and~|#4| are the % three tokens that would otherwise continue the loop and are discarded). % \begin{macrocode} \cs_new_protected:Npn \@@_peek_analysis_collect_end:NNNN #1#2#3#4 { #1 \tl_put_right:Ne \l_@@_peek_code_tl { { \exp_not:N \exp_not:n { \exp_not:c { \l_@@_internal_a_tl } } } { -1 } 0 } \l_@@_peek_code_tl } % \end{macrocode} % \end{macro} % % \subsection{Messages} % % \begin{variable}{\c_@@_analysis_show_etc_str} % When a control sequence (or active character) % and its meaning are too long to fit in one line % of the terminal, the end is replaced by this token list. % \begin{macrocode} \tl_const:Ne \c_@@_analysis_show_etc_str % ( { \token_to_str:N \ETC.) } % \end{macrocode} % \end{variable} % % The message used by the show and log functions: |#1|~is the name of % the token list variable (empty when a token list was given directly) % and |#2|~is the formatted list of tokens. % \begin{macrocode} \msg_new:nnn { tl } { show-analysis } { The~token~list~ \tl_if_empty:nF {#1} { #1 ~ } \tl_if_empty:nTF {#2} { is~empty } { contains~the~tokens: #2 } } % \end{macrocode} % % \begin{macrocode} % % \end{macrocode} % % \end{implementation} % % \PrintIndex