src/lib/html/html-elements-g.pkg

15.4.810 src/lib/html/html-elements-g.pkg

## html-elements-g.pkg

# Compiled by:
#     src/lib/html/html.lib

# This module builds element tags for the lexer.

generic package html_elements_g (
    package tokens:     Html_Tokens;            # Html_Tokens   is from   src/lib/html/html.grammar.api
    package err:        Html_Error;             # Html_Error    is from   src/lib/html/html-error.api
    package htmlattrs:  Html_Attributes;                # Html_Attributes       is from   src/lib/html/html-attributes.api
  )
: (weak)
api {
    package t:  Html_Tokens;                    # Html_Tokens   is from   src/lib/html/html.grammar.api

    Source_Position = Int;

    start_tag: Null_Or( String ) -> ((String, Source_Position, Source_Position)) -> Null_Or( t::Token( t::Semantic_Value, Source_Position ) );
    end_tag:   Null_Or( String ) -> ((String, Source_Position, Source_Position)) -> Null_Or( t::Token( t::Semantic_Value, Source_Position ) );

}
{
    package t = tokens;
    package a = htmlattrs;

    Source_Position = Int;

    Start_Tag
        = WATTRS   (((a::Attributes, Source_Position, Source_Position)) ->  t::Token (t::Semantic_Value, Source_Position))
        | WOATTRS  (((Source_Position, Source_Position)) ->  t::Token (t::Semantic_Value, Source_Position));

    End_Tag
        = END  (((Source_Position, Source_Position)) ->  t::Token (t::Semantic_Value, Source_Position))
        | EMPTY;

    token_data = [
            ("A",               WATTRS t::start_a,              END t::end_a),
            ("ADDRESS",         WOATTRS t::start_address,       END t::end_address),
            ("APPLET",          WATTRS t::start_applet,         END t::end_applet),
            ("AREA",            WATTRS t::tag_area,             EMPTY),
            ("B",               WOATTRS t::start_b,             END t::end_b),
            ("BASE",            WATTRS t::tag_base,             EMPTY),
            ("BASEFONT",        WATTRS t::start_basefont,       END t::end_basefont),
            ("BIG",             WOATTRS t::start_big,           END t::end_big),
            ("BLOCKQUOTE",      WOATTRS t::start_blockquote,    END t::end_blockquote),
            ("BODY",            WATTRS t::start_body,           END t::end_body),
            ("BR",              WATTRS t::tag_br,               EMPTY),
            ("CAPTION",         WATTRS t::start_caption,        END t::end_caption),
            ("CENTER",          WOATTRS t::start_center,        END t::end_center),
            ("CITE",            WOATTRS t::start_cite,          END t::end_cite),
            ("CODE",            WOATTRS t::start_code,          END t::end_code),
            ("DD",              WOATTRS t::start_dd,            END t::end_dd),
            ("DFN",             WOATTRS t::start_dfn,           END t::end_dfn),
            ("DIR",             WATTRS t::start_dir,            END t::end_dir),
            ("DIV",             WATTRS t::start_div,            END t::end_div),
            ("DL",              WATTRS t::start_dl,             END t::end_dl),
            ("DT",              WOATTRS t::start_dt,            END t::end_dt),
            ("EM",              WOATTRS t::start_em,            END t::end_em),
            ("FONT",            WATTRS t::start_font,           END t::end_font),
            ("FORM",            WATTRS t::start_form,           END t::end_form),
            ("H1",              WATTRS t::start_h1,             END t::end_h1),
            ("H2",              WATTRS t::start_h2,             END t::end_h2),
            ("H3",              WATTRS t::start_h3,             END t::end_h3),
            ("H4",              WATTRS t::start_h4,             END t::end_h4),
            ("H5",              WATTRS t::start_h5,             END t::end_h5),
            ("H6",              WATTRS t::start_h6,             END t::end_h6),
            ("HEAD",            WOATTRS t::start_head,          END t::end_head),
            ("HR",              WATTRS t::tag_hr,               EMPTY),
            ("HTML",            WOATTRS t::start_html,          END t::end_html),
            ("I",               WOATTRS t::start_i,             END t::end_i),
            ("IMG",             WATTRS t::tag_img,              EMPTY),
            ("INPUT",           WATTRS t::tag_input,            EMPTY),
            ("ISINDEX",         WATTRS t::tag_isindex,          EMPTY),
            ("KBD",             WOATTRS t::start_kbd,           END t::end_kbd),
            ("LI",              WATTRS t::start_li,             END t::end_li),
            ("LINK",            WATTRS t::tag_link,             EMPTY),
            ("MAP",             WATTRS t::start_map,            END t::end_map),
            ("MENU",            WATTRS t::start_menu,           END t::end_menu),
            ("META",            WATTRS t::tag_meta,             EMPTY),
            ("OL",              WATTRS t::start_ol,             END t::end_ol),
            ("OPTION",          WATTRS t::start_option,         END t::end_option),
            ("P",               WATTRS t::start_p,              END t::end_p),
            ("PARAM",           WATTRS t::tag_param,            EMPTY),
            ("PRE",             WATTRS t::start_pre,            END t::end_pre),
            ("SAMP",            WOATTRS t::start_samp,          END t::end_samp),
            ("SCRIPT",          WOATTRS t::start_script,        END t::end_script),
            ("SELECT",          WATTRS t::start_select,         END t::end_select),
            ("SMALL",           WOATTRS t::start_small,         END t::end_small),
            ("STRIKE",          WOATTRS t::start_strike,        END t::end_strike),
            ("STRONG",          WOATTRS t::start_strong,        END t::end_strong),
            ("STYLE",           WOATTRS t::start_style,         END t::end_style),
            ("SUB",             WOATTRS t::start_sub,           END t::end_sub),
            ("SUP",             WOATTRS t::start_sup,           END t::end_sup),
            ("TABLE",           WATTRS t::start_table,          END t::end_table),
            ("TD",              WATTRS t::start_td,             END t::end_td),
            ("TEXTAREA",        WATTRS t::start_textarea,       END t::end_textarea),
            ("TH",              WATTRS t::start_th,             END t::end_th),
            ("TITLE",           WOATTRS t::start_title,         END t::end_title),
            ("TR",              WATTRS t::start_tr,             END t::end_tr),
            ("TT",              WOATTRS t::start_tt,            END t::end_tt),
            ("U",               WOATTRS t::start_u,             END t::end_u),
            ("UL",              WATTRS t::start_ul,             END t::end_ul),
            ("VAR",             WOATTRS t::start_var,           END t::end_var)
          ];

    package sht
        =
        typelocked_hashtable_g (
            #
            Hash_Key = String;

            hash_value = hash_string::hash_string;

            same_key = ((==) : ((String, String)) -> Bool);
        );

    elem_table
        =
        table
        where
            table = sht::make_hashtable  { size_hint => length token_data,   not_found_exception => DIE "HTMLElements" };

            fun ins (tag, start_tok, end_tok)
                =
                sht::set
                    table
                    (tag, { start_t=>start_tok, end_t=>end_tok } );

            list::apply ins token_data;
        end;

    package ss = substring;     # substring     is from   src/lib/std/substring.pkg

    fun canonical_name name
        =
        ss::translate
            (string::from_char o char::to_upper)
            name;

    fun find name
        =
        (sht::find elem_table (canonical_name name));

    skip_ws = ss::drop_prefix char::is_space;

    fun scan_string (ctx, quote_char, ss)
        =
        {   my  (str, rest)
                =
                ss::split_off_prefix
                    {. #c != quote_char; }
                    ss;

            if   (ss::is_empty rest)
                 err::lex_error ctx "missing close quote for string";
                 (a::STRING (ss::to_string str), rest);
            else
                 (a::STRING (ss::to_string str), ss::drop_first 1 rest);
            fi;
        };

    # Scan an attribute value from a substring, returning the value, and
    # the rest of the substring.  Attribute values have one of the following
    # forms:
    #   1) a name token (a sequence of letters, digits, periods, or hyphens).
    #   2) a string literal enclosed in ""
    #   3) a string literal enclosed in ''
    #
    fun scan_attribute_val (ctx, attribute_name, ss)
        =
        {   fun is_name_char ('.' | '-') =>  TRUE;
                is_name_char c           =>  char::is_alphanumeric  c;
            end;

            case (ss::getc ss)
                NULL            =>  (a::IMPLICIT, ss);
                THE('"',  rest) =>  scan_string (ctx, '"', rest);
                THE('\'', rest) =>  scan_string (ctx, '\'', rest);

                THE (c, _)
                    =>
                    {   # Unquoted attributes should be Names, but this is often not
                        # the case, so we terminate them on whitespace or ">".
                        #
                        not_name_char = REF FALSE;

                        fun is_attribute_char c
                            =
                            if   (char::is_space c   or   c == '>')
                                 FALSE;
                            else
                                 if (not (is_name_char c))   not_name_char := TRUE;   fi;
                                 TRUE;
                            fi;

                        my (value, rest)
                            =
                            ss::split_off_prefix is_attribute_char ss;

                        if  (ss::is_empty  value)

                            err::bad_attribute_val ctx (ss::to_string attribute_name, "");
                            (a::IMPLICIT, ss);
                        else
                            if *not_name_char
                                 err::unquoted_attribute_val ctx (ss::to_string attribute_name);
                                 (a::STRING (ss::to_string value), rest);
                            else (a::NAME   (ss::to_string value), rest);  fi;
                        fi;
                      };
            esac;
          };

    fun scan_start_tag (ctx, ss)
        =
        scan_attributes (rest, [])
        where

            my (name, rest)
                =
                ss::split_off_prefix (not o char::is_space) ss;

            fun scan_attributes (rest, attributes)
                =
                {
                    rest = skip_ws rest;

                    case (ss::getc rest)

                        NULL => (name, list::reverse attributes);

                        THE ('"', rest)
                            =>
                            {   err::lex_error ctx "bogus text in element";
                                scan_attributes (#2 (scan_string (ctx, '"', rest)), attributes);
                            };

                        THE ('\'', rest)
                            =>
                            {   err::lex_error ctx "bogus text in element";
                                scan_attributes (#2 (scan_string (ctx, '\'', rest)), attributes);
                            };

                        THE (c, rest')
                            =>
                            if (char::is_alpha  c)

                                my (a_name, rest)
                                    =
                                    ss::split_off_prefix
                                       char::is_alphanumeric
                                       rest;

                                rest = skip_ws rest;

                                case (ss::getc rest)

                                    THE ('=', rest)
                                        =>
                                        {   # Get the attribute value:
                                            #
                                            my (a_val, rest)
                                                =
                                                scan_attribute_val (ctx, a_name, skip_ws rest);

                                            scan_attributes (rest, (canonical_name a_name, a_val) ! attributes);
                                        };

                                    _   =>
                                        scan_attributes (rest,
                                           (canonical_name a_name, a::IMPLICIT) ! attributes);
                                esac;

                            else
                                err::lex_error ctx "bogus character in element";
                                scan_attributes (rest', attributes);
                            fi;
                    esac;
                };
        end;

    fun start_tag file (tag, p1, p2)
        =
        {
            ctx  = { file, line=>p1 };
            tag' = ss::drop_first 1 (ss::drop_last 1 (ss::from_string tag));

            my (name, attributes)
                =
                scan_start_tag (ctx, tag');

            case (find name, attributes)

                 (NULL, _) => { err::bad_start_tag ctx (ss::to_string name); NULL; };

                 (THE { start_t=>WOATTRS _, ... }, _ ! _) => {
                    list::apply (err::unknown_attribute ctx o #1) attributes; NULL;};

                 (THE { start_t=>WOATTRS tag, ... }, []) =>
                    THE (tag (p1, p2));

                 (THE { start_t=>WATTRS tag, ... }, attributes) =>
                    THE (tag (attributes, p1, p2));
            esac;
          };

    fun end_tag file (tag, p1, p2)
        =
        {
            ctx = { file, line=>p1 };
            name = ss::drop_first 2 (ss::drop_last 1 (ss::from_string tag));

            case (find name)

                  NULL                           =>  { err::bad_end_tag ctx (ss::to_string name); NULL;};
                  THE { end_t=>EMPTY, ... }       =>  { err::bad_end_tag ctx (ss::to_string name); NULL;};
                  THE { end_t=>END end_tok, ... } =>  THE (end_tok (p1, p2));
            esac;
        };

};

## COPYRIGHT (c) 1996 AT&T REsearch.
## Subsequent changes by Jeff Prothero Copyright (c) 2010-2015,
## released per terms of SMLNJ-COPYRIGHT.

Comments and suggestions to: bugs@mythryl.org