implement HTML;
include "sys.m";
include "html.m";
include "strinttab.m";
sys: Sys;
T: StringIntTab;
Stringtab: adt
{
name: string;
val: int;
};
chartab:= array[] of { T->StringInt
("AElig", 'Æ'),
("Aacute", 'Á'),
("Acirc", 'Â'),
("Agrave", 'À'),
("Aring", 'Å'),
("Atilde", 'Ã'),
("Auml", 'Ä'),
("Ccedil", 'Ç'),
("ETH", 'Ð'),
("Eacute", 'É'),
("Ecirc", 'Ê'),
("Egrave", 'È'),
("Euml", 'Ë'),
("Iacute", 'Í'),
("Icirc", 'Î'),
("Igrave", 'Ì'),
("Iuml", 'Ï'),
("Ntilde", 'Ñ'),
("Oacute", 'Ó'),
("Ocirc", 'Ô'),
("Ograve", 'Ò'),
("Oslash", 'Ø'),
("Otilde", 'Õ'),
("Ouml", 'Ö'),
("THORN", 'Þ'),
("Uacute", 'Ú'),
("Ucirc", 'Û'),
("Ugrave", 'Ù'),
("Uuml", 'Ü'),
("Yacute", 'Ý'),
("aacute", 'á'),
("acirc", 'â'),
("acute", '´'),
("aelig", 'æ'),
("agrave", 'à'),
("alpha", 'α'),
("amp", '&'),
("aring", 'å'),
("atilde", 'ã'),
("auml", 'ä'),
("beta", 'β'),
("brvbar", '¦'),
("ccedil", 'ç'),
("cdots", '⋯'),
("cedil", '¸'),
("cent", '¢'),
("chi", 'χ'),
("copy", '©'),
("curren", '¤'),
("ddots", '⋱'),
("deg", '°'),
("delta", 'δ'),
("divide", '÷'),
("eacute", 'é'),
("ecirc", 'ê'),
("egrave", 'è'),
("emdash", '—'),
("emsp", ' '),
("endash", '–'),
("ensp", ' '),
("epsilon", 'ε'),
("eta", 'η'),
("eth", 'ð'),
("euml", 'ë'),
("frac12", '½'),
("frac14", '¼'),
("frac34", '¾'),
("gamma", 'γ'),
("gt", '>'),
("iacute", 'í'),
("icirc", 'î'),
("iexcl", '¡'),
("igrave", 'ì'),
("iota", 'ι'),
("iquest", '¿'),
("iuml", 'ï'),
("kappa", 'κ'),
("lambda", 'λ'),
("laquo", '«'),
("ldots", '…'),
("lt", '<'),
("macr", '¯'),
("micro", 'µ'),
("middot", '·'),
("mu", 'μ'),
("nbsp", ' '),
("not", '¬'),
("ntilde", 'ñ'),
("nu", 'ν'),
("oacute", 'ó'),
("ocirc", 'ô'),
("ograve", 'ò'),
("omega", 'ω'),
("omicron", 'ο'),
("ordf", 'ª'),
("ordm", 'º'),
("oslash", 'ø'),
("otilde", 'õ'),
("ouml", 'ö'),
("para", '¶'),
("phi", 'φ'),
("pi", 'π'),
("plusmn", '±'),
("pound", '£'),
("psi", 'ψ'),
("quad", ' '),
("quot", '"'),
("raquo", '»'),
("reg", '®'),
("rho", 'ρ'),
("sect", '§'),
("shy", ''),
("sigma", 'σ'),
("sp", ' '),
("sup1", '¹'),
("sup2", '²'),
("sup3", '³'),
("szlig", 'ß'),
("tau", 'τ'),
("theta", 'θ'),
("thinsp", ' '),
("thorn", 'þ'),
("times", '×'),
("trade", '™'),
("uacute", 'ú'),
("ucirc", 'û'),
("ugrave", 'ù'),
("uml", '¨'),
("upsilon", 'υ'),
("uuml", 'ü'),
("varepsilon", '∈'),
("varphi", 'ϕ'),
("varpi", 'ϖ'),
("varrho", 'ϱ'),
("vdots", '⋮'),
("vsigma", 'ς'),
("vtheta", 'ϑ'),
("xi", 'ξ'),
("yacute", 'ý'),
("yen", '¥'),
("yuml", 'ÿ'),
("zeta", 'ζ'),
};
htmlstringtab := array[] of { T->StringInt
("a", Ta),
("address", Taddress),
("applet", Tapplet),
("area", Tarea),
("att_footer", Tatt_footer),
("b", Tb),
("base", Tbase),
("basefont", Tbasefont),
("big", Tbig),
("blink", Tblink),
("blockquote", Tblockquote),
("body", Tbody),
("bq", Tbq),
("br", Tbr),
("caption", Tcaption),
("center", Tcenter),
("cite", Tcite),
("code", Tcode),
("col", Tcol),
("colgroup", Tcolgroup),
("dd", Tdd),
("dfn", Tdfn),
("dir", Tdir),
("div", Tdiv),
("dl", Tdl),
("dt", Tdt),
("em", Tem),
("font", Tfont),
("form", Tform),
("frame", Tframe),
("frameset", Tframeset),
("h1", Th1),
("h2", Th2),
("h3", Th3),
("h4", Th4),
("h5", Th5),
("h6", Th6),
("head", Thead),
("hr", Thr),
("html", Thtml),
("i", Ti),
("img", Timg),
("input", Tinput),
("isindex", Tisindex),
("item", Titem),
("kbd", Tkbd),
("li", Tli),
("link", Tlink),
("map", Tmap),
("menu", Tmenu),
("meta", Tmeta),
("nobr", Tnobr),
("noframes", Tnoframes),
("ol", Tol),
("option", Toption),
("p", Tp),
("param", Tparam),
("pre", Tpre),
("q", Tq),
("samp", Tsamp),
("script", Tscript),
("select", Tselect),
("small", Tsmall),
("strike", Tstrike),
("strong", Tstrong),
("style", Tstyle),
("sub", Tsub),
("sup", Tsup),
("t", Tt),
("table", Ttable),
("tbody", Ttbody),
("td", Ttd),
("textarea", Ttextarea),
("textflow", Ttextflow),
("tfoot", Ttfoot),
("th", Tth),
("thead", Tthead),
("title", Ttitle),
("tr", Ttr),
("tt", Ttt),
("u", Tu),
("ul", Tul),
("var", Tvar)
};
W, D, L, U, N: con byte (1<D, '1'=>D, '2'=>D, '3'=>D, '4'=>D,
'5'=>D, '6'=>D, '7'=>D, '8'=>D, '9'=>D,
'A'=>U, 'B'=>U, 'C'=>U, 'D'=>U, 'E'=>U, 'F'=>U,
'G'=>U, 'H'=>U, 'I'=>U, 'J'=>U, 'K'=>U, 'L'=>U,
'M'=>U, 'N'=>U, 'O'=>U, 'P'=>U, 'Q'=>U, 'R'=>U,
'S'=>U, 'T'=>U, 'U'=>U, 'V'=>U, 'W'=>U, 'X'=>U,
'Y'=>U, 'Z'=>U,
'a'=>L, 'b'=>L, 'c'=>L, 'd'=>L, 'e'=>L, 'f'=>L,
'g'=>L, 'h'=>L, 'i'=>L, 'j'=>L, 'k'=>L, 'l'=>L,
'm'=>L, 'n'=>L, 'o'=>L, 'p'=>L, 'q'=>L, 'r'=>L,
's'=>L, 't'=>L, 'u'=>L, 'v'=>L, 'w'=>L, 'x'=>L,
'y'=>L, 'z'=>L,
'.'=>N, '-'=>N,
' '=>W, '\n'=>W, '\t'=>W, '\r'=>W,
* => byte 0
};
lex(b: array of byte, charset: int, keepwh: int): array of ref Lex
{
if(sys == nil)
sys = load Sys Sys->PATH;
if(T == nil)
T = load StringIntTab StringIntTab->PATH;
if(T == nil) {
sys->print("HTML->lex: couldn't %s\n", StringIntTab->PATH);
return nil;
}
a: array of ref Lex;
ai := 0;
i := 0;
nb := len b;
for(;;){
Whitespace:
for(;;){
# ignore nulls
while(i'){
i += 3;
continue Whitespace;
}
i++;
}
continue Whitespace;
}
break;
}
if(i == nb)
break;
if(ai == len a){
na := array[len a + 500] of ref Lex;
if(a != nil)
na[0:] = a;
a = na;
}
if(int b[i] == '<'){
lx : ref Lex;
(lx, i) = gettag(b, i, charset);
a[ai++] = lx;
}
else {
s: string;
(s, i) = getdata(b, i, keepwh, charset);
a[ai++] = ref Lex (Data, s, nil);
}
}
return a[0:ai];
}
getdata(b: array of byte, i: int, keepnls, charset: int): (string, int)
{
s:= "";
j:= 0;
c: int;
nb := len b;
loop:
while(i < nb){
oldi := i;
case charset{
Latin1 =>
c = int b[i++];
UTF8 =>
j: int;
(c, j, nil) = sys->byte2char(b, i);
i += j;
}
case c {
0 or 16r1a =>
continue loop;
'<' =>
i = oldi;
break loop;
'&' =>
(c, i) = ampersand(b, i);
'\n' =>
if(!keepnls)
c = ' ';
'\r' =>
if(oldi > 0 && int b[oldi-1] == '\n')
continue loop;
if(keepnls)
c = '\n';
else
c = ' ';
}
s[j++] = c;
}
return (s, i);
}
gettag(b: array of byte, i, charset: int): (ref Lex, int)
{
rbra := 0;
nb := len b;
ans := ref Lex(Notfound, "", nil);
al: list of Attr;
if(++i == nb)
return (ans, i);
istart := i;
c := int b[i];
if(c == '/') {
rbra = RBRA;
if(++i == nb)
return (ans, i);
c = int b[i];
}
if(c>=NCTYPE || !int (ctype[c]&(L|U))) {
while(i < nb) {
c = int b[i++];
if(c == '>')
break;
}
ans.text = string b[istart:i];
return (ans, i);
}
namstart := i;
while(clookup(htmlstringtab, name);
if(fnd)
ans.tag = tag+rbra;
else
ans.text = name;
attrloop:
while(i < nb){
# look for "ws name" or "ws name ws = ws val" (ws=whitespace)
# skip whitespace
while(c') {
i++;
break;
}
if(c == '<')
break; # error: unclosed tag
if(c>=NCTYPE || !int (ctype[c]&(L|U))) {
# error, not the start of a name
# skip to end of tag
while(i < nb) {
c = int b[i++];
if(c == '>')
break;
}
break attrloop;
}
# gather name
namstart = i;
while(c
c = int b[i++];
UTF8 =>
j: int;
(c, j, nil) = sys->byte2char(b, i);
i += j;
}
if(c == '>') {
if(quote) {
# c might be part of string (though not good style)
# but if line ends before close quote, assume
# there was an unmatched quote
for(k := i; k < nb; k++) {
c = int b[k];
if(c == quote) {
val[nv++] = '>';
continue valloop;
}
if(c == '\n') {
i--;
break valloop;
}
}
}
i--;
break valloop;
}
if(quote) {
if(c == quote)
break valloop;
if(c == '\n')
continue valloop;
if(c == '\t' || c == '\r')
c = ' ';
}
else {
if(c= nb)
return ('?', i);
fnd := 0;
ans := 0;
if(int b[i] == '#'){
i++;
while(ilookup(chartab, s);
}
if(!fnd)
return ('&', starti);
if(i
return 1;
Data =>
return 0;
}
}
return 0;
}
# for debugging
lex2string(l: ref Lex): string
{
ans := "";
tag := l.tag;
if(tag == HTML->Data)
ans = "'" + l.text + "'";
else {
ans = "<";
if(tag >= RBRA) {
tag -= RBRA;
ans = ans + "/";
}
tname := T->revlookup(htmlstringtab, tag);
if(tname != nil)
ans = ans + uppercase(tname);
for(al := l.attr; al != nil; al = tl al) {
a := hd al;
ans = ans + " " + a.name + "='" + a.value + "'";
}
ans = ans + ">";
}
return ans;
}