implement Lex;

include "common.m";

# local copies from CU
sys: Sys;
CU: CharonUtils;
S: String;
T: StringIntTab;
C: Ctype;
J: Script;
ctype: array of byte;

# Negative values so they can be told apart from real characters.
EOF : con -2;
EOB : con -1;

# Tag names, in sorted order; init() turns this into tagtable.
tagnames = array[] of {
	" ",
	"!",
	"a",
	"abbr",
	"acronym",
	"address",
	"applet",
	"area",
	"b",
	"base",
	"basefont",
	"bdo",
	"big",
	"blink",
	"blockquote",
	"body",
	"bq",
	"br",
	"button",
	"caption",
	"center",
	"cite",
	"code",
	"col",
	"colgroup",
	"dd",
	"del",
	"dfn",
	"dir",
	"div",
	"dl",
	"dt",
	"em",
	"fieldset",
	"font",
	"form",
	"frame",
	"frameset",
	"h1",
	"h2",
	"h3",
	"h4",
	"h5",
	"h6",
	"head",
	"hr",
	"html",
	"i",
	"iframe",
	"image",
	"img",
	"input",
	"ins",
	"isindex",
	"kbd",
	"label",
	"legend",
	"li",
	"link",
	"map",
	"menu",
	"meta",
	"nobr",
	"noframes",
	"noscript",
	"object",
	"ol",
	"optgroup",
	"option",
	"p",
	"param",
	"pre",
	"q",
	"s",
	"samp",
	"script",
	"select",
	"small",
	"span",
	"strike",
	"strong",
	"style",
	"sub",
	"sup",
	"table",
	"tbody",
	"td",
	"textarea",
	"tfoot",
	"th",
	"thead",
	"title",
	"tr",
	"tt",
	"u",
	"ul",
	"var",
	"xmp"
};

tagtable : array of T->StringInt;	# initialized from tagnames

# Attribute names, in sorted order; init() turns this into attrtable.
attrnames = array[] of {
	"abbr",
	"accept",
	"accept-charset",
	"accesskey",
	"action",
	"align",
	"alink",
	"alt",
	"archive",
	"axis",
	"background",
	"bgcolor",
	"border",
	"cellpadding",
	"cellspacing",
	"char",
	"charoff",
	"charset",
	"checked",
	"cite",
	"class",
	"classid",
	"clear",
	"code",
	"codebase",
	"codetype",
	"color",
	"cols",
	"colspan",
	"compact",
	"content",
	"coords",
	"data",
	"datafld",
	"dataformatas",
	"datapagesize",
	"datasrc",
	"datetime",
	"declare",
	"defer",
	"dir",
	"disabled",
	"enctype",
	"event",
	"face",
	"for",
	"frame",
	"frameborder",
	"headers",
	"height",
	"href",
	"hreflang",
	"hspace",
	"http-equiv",
	"id",
	"ismap",
	"label",
	"lang",
	"language",
	"link",
	"longdesc",
	"lowsrc",
	"marginheight",
	"marginwidth",
	"maxlength",
	"media",
	"method",
	"multiple",
	"name",
	"nohref",
	"noresize",
	"noshade",
	"nowrap",
	"object",
	"onabort",
	"onblur",
	"onchange",
	"onclick",
	"ondblclick",
	"onerror",
	"onfocus",
	"onkeydown",
	"onkeypress",
	"onkeyup",
	"onload",
	"onmousedown",
	"onmousemove",
	"onmouseout",
	"onmouseover",
	"onmouseup",
	"onreset",
	"onresize",
	"onselect",
	"onsubmit",
	"onunload",
	"profile",
	"prompt",
	"readonly",
	"rel",
	"rev",
	"rows",
	"rowspan",
	"rules",
	"scheme",
	"scope",
	"scrolling",
	"selected",
	"shape",
	"size",
	"span",
	"src",
	"standby",
	"start",
	"style",
	"summary",
	"tabindex",
	"target",
	"text",
	"title",
	"type",
	"usemap",
	"valign",
	"value",
	"valuetype",
	"version",
	"vlink",
	"vspace",
	"width"
};

attrtable : array of T->StringInt;	# initialized from attrnames

# Entity name -> character value, kept sorted by name.
# Invisible and special-space characters are written as explicit
# code points (16r...) rather than as literal characters, so that
# they cannot be lost or altered by editors and whitespace cleanup.
chartab:= array[] of { T->StringInt
	("AElig", 'Æ'),
	("Aacute", 'Á'),
	("Acirc", 'Â'),
	("Agrave", 'À'),
	("Alpha", 'Α'),
	("Aring", 'Å'),
	("Atilde", 'Ã'),
	("Auml", 'Ä'),
	("Beta", 'Β'),
	("Ccedil", 'Ç'),
	("Chi", 'Χ'),
	("Dagger", '‡'),
	("Delta", 'Δ'),
	("ETH", 'Ð'),
	("Eacute", 'É'),
	("Ecirc", 'Ê'),
	("Egrave", 'È'),
	("Epsilon", 'Ε'),
	("Eta", 'Η'),
	("Euml", 'Ë'),
	("Gamma", 'Γ'),
	("Iacute", 'Í'),
	("Icirc", 'Î'),
	("Igrave", 'Ì'),
	("Iota", 'Ι'),
	("Iuml", 'Ï'),
	("Kappa", 'Κ'),
	("Lambda", 'Λ'),
	("Mu", 'Μ'),
	("Ntilde", 'Ñ'),
	("Nu", 'Ν'),
	("OElig", 'Œ'),
	("Oacute", 'Ó'),
	("Ocirc", 'Ô'),
	("Ograve", 'Ò'),
	("Omega", 'Ω'),
	("Omicron", 'Ο'),
	("Oslash", 'Ø'),
	("Otilde", 'Õ'),
	("Ouml", 'Ö'),
	("Phi", 'Φ'),
	("Pi", 'Π'),
	("Prime", '″'),
	("Psi", 'Ψ'),
	("Rho", 'Ρ'),
	("Scaron", 'Š'),
	("Sigma", 'Σ'),
	("THORN", 'Þ'),
	("Tau", 'Τ'),
	("Theta", 'Θ'),
	("Uacute", 'Ú'),
	("Ucirc", 'Û'),
	("Ugrave", 'Ù'),
	("Upsilon", 'Υ'),
	("Uuml", 'Ü'),
	("Xi", 'Ξ'),
	("Yacute", 'Ý'),
	("Yuml", 'Ÿ'),
	("Zeta", 'Ζ'),
	("aacute", 'á'),
	("acirc", 'â'),
	("acute", '´'),
	("aelig", 'æ'),
	("agrave", 'à'),
	("alefsym", 'ℵ'),
	("alpha", 'α'),
	("amp", '&'),
	("and", '∧'),
	("ang", '∠'),
	("aring", 'å'),
	("asymp", '≈'),
	("atilde", 'ã'),
	("auml", 'ä'),
	("bdquo", '„'),
	("beta", 'β'),
	("brvbar", '¦'),
	("bull", '•'),
	("cap", '∩'),
	("ccedil", 'ç'),
	("cdots", '⋯'),
	("cedil", '¸'),
	("cent", '¢'),
	("chi", 'χ'),
	("circ", 'ˆ'),
	("clubs", '♣'),
	("cong", '≅'),
	("copy", '©'),
	("crarr", '↵'),
	("cup", '∪'),
	("curren", '¤'),
	("dArr", '⇓'),
	("dagger", '†'),
	("darr", '↓'),
	("ddots", '⋱'),
	("deg", '°'),
	("delta", 'δ'),
	("diams", '♦'),
	("divide", '÷'),
	("eacute", 'é'),
	("ecirc", 'ê'),
	("egrave", 'è'),
	("emdash", '—'),
	("empty", '∅'),
	("emsp", 16r2003),	# EM SPACE
	("endash", '–'),
	("ensp", 16r2002),	# EN SPACE
	("epsilon", 'ε'),
	("equiv", '≡'),
	("eta", 'η'),
	("eth", 'ð'),
	("euml", 'ë'),
	("euro", '€'),
	("exist", '∃'),
	("fnof", 'ƒ'),
	("forall", '∀'),
	("frac12", '½'),
	("frac14", '¼'),
	("frac34", '¾'),
	("frasl", '⁄'),
	("gamma", 'γ'),
	("ge", '≥'),
	("gt", '>'),
	("hArr", '⇔'),
	("harr", '↔'),
	("hearts", '♥'),
	("hellip", '…'),
	("iacute", 'í'),
	("icirc", 'î'),
	("iexcl", '¡'),
	("igrave", 'ì'),
	("image", 'ℑ'),
	("infin", '∞'),
	("int", '∫'),
	("iota", 'ι'),
	("iquest", '¿'),
	("isin", '∈'),
	("iuml", 'ï'),
	("kappa", 'κ'),
	("lArr", '⇐'),
	("lambda", 'λ'),
	("lang", '〈'),
	("laquo", '«'),
	("larr", '←'),
	("lceil", '⌈'),
	("ldots", '…'),
	("ldquo", '“'),
	("le", '≤'),
	("lfloor", '⌊'),
	("lowast", '∗'),
	("loz", '◊'),
	("lrm", 16r200E),	# LEFT-TO-RIGHT MARK
	("lsaquo", '‹'),
	("lsquo", '‘'),
	("lt", '<'),
	("macr", '¯'),
	("mdash", '—'),
	("micro", 'µ'),
	("middot", '·'),
	("minus", '−'),
	("mu", 'μ'),
	("nabla", '∇'),
	("nbsp", 16r00A0),	# NO-BREAK SPACE
	("ndash", '–'),
	("ne", '≠'),
	("ni", '∋'),
	("not", '¬'),
	("notin", '∉'),
	("nsub", '⊄'),
	("ntilde", 'ñ'),
	("nu", 'ν'),
	("oacute", 'ó'),
	("ocirc", 'ô'),
	("oelig", 'œ'),
	("ograve", 'ò'),
	("oline", '‾'),
	("omega", 'ω'),
	("omicron", 'ο'),
	("oplus", '⊕'),
	("or", '∨'),
	("ordf", 'ª'),
	("ordm", 'º'),
	("oslash", 'ø'),
	("otilde", 'õ'),
	("otimes", '⊗'),
	("ouml", 'ö'),
	("para", '¶'),
	("part", '∂'),
	("permil", '‰'),
	("perp", '⊥'),
	("phi", 'φ'),
	("pi", 'π'),
	("piv", 'ϖ'),
	("plusmn", '±'),
	("pound", '£'),
	("prime", '′'),
	("prod", '∏'),
	("prop", '∝'),
	("psi", 'ψ'),
	# NOTE(review): "quad" is not a standard HTML entity;
	# U+2001 (EM QUAD) is assumed here -- confirm against upstream.
	("quad", 16r2001),
	("quot", '"'),
	("rArr", '⇒'),
	("radic", '√'),
	("rang", '〉'),
	("raquo", '»'),
	("rarr", '→'),
	("rceil", '⌉'),
	("rdquo", '”'),
	("real", 'ℜ'),
	("reg", '®'),
	("rfloor", '⌋'),
	("rho", 'ρ'),
	("rlm", 16r200F),	# RIGHT-TO-LEFT MARK
	("rsaquo", '›'),
	("rsquo", '’'),
	("sbquo", '‚'),
	("scaron", 'š'),
	("sdot", '⋅'),
	("sect", '§'),
	("shy", 16r00AD),	# SOFT HYPHEN
	("sigma", 'σ'),
	("sigmaf", 'ς'),
	("sim", '∼'),
	# NOTE(review): "sp" is not a standard HTML entity;
	# U+2002 (EN SPACE) is assumed here -- confirm against upstream.
	("sp", 16r2002),
	("spades", '♠'),
	("sub", '⊂'),
	("sube", '⊆'),
	("sum", '∑'),
	("sup", '⊃'),
	("sup1", '¹'),
	("sup2", '²'),
	("sup3", '³'),
	("supe", '⊇'),
	("szlig", 'ß'),
	("tau", 'τ'),
	("there4", '∴'),
	("theta", 'θ'),
	("thetasym", 'ϑ'),
	("thinsp", 16r2009),	# THIN SPACE
	("thorn", 'þ'),
	("tilde", '˜'),
	("times", '×'),
	("trade", '™'),
	("uArr", '⇑'),
	("uacute", 'ú'),
	("uarr", '↑'),
	("ucirc", 'û'),
	("ugrave", 'ù'),
	("uml", '¨'),
	("upsih", 'ϒ'),
	("upsilon", 'υ'),
	("uuml", 'ü'),
	("varepsilon", '∈'),
	("varphi", 'ϕ'),
	("varpi", 'ϖ'),
	("varrho", 'ϱ'),
	("vdots", '⋮'),
	("vsigma", 'ς'),
	("vtheta", 'ϑ'),
	("weierp", '℘'),
	("xi", 'ξ'),
	("yacute", 'ý'),
	("yen", '¥'),
	("yuml", 'ÿ'),
	("zeta", 'ζ'),
	("zwj", 16r200D),	# ZERO WIDTH JOINER
	("zwnj", 16r200C),	# ZERO WIDTH NON-JOINER
};

# Characters Winstart..Winend are those that Windows
# uses interpolated into the Latin1 set.
# They aren't supposed to appear in HTML, but they do....
# winchars has one entry per code in Winstart..Winend, in order.
Winstart : con 16r7f;
Winend: con 16r9f;
winchars := array[] of {
	'•',
	'•', '•', '‚', 'ƒ', '„', '…', '†', '‡',
	'ˆ', '‰', 'Š', '‹', 'Œ', '•', '•', '•',
	'•', '‘', '’', '“', '”', '•', '–', '—',
	'˜', '™', 'š', '›', 'œ', '•', '•', 'Ÿ'
};

# Character-class masks built from the Ctype class bits.
NAMCHAR : con (C->L|C->U|C->D|C->N);
LETTER : con (C->L|C->U);

dbg := 0;
warn := 0;

# Module initialization: record the CharonUtils handle, load the
# modules used here, pick up the C and J modules that cu already
# holds, and build the tag and attribute lookup tables.
init(cu: CharonUtils)
{
	CU = cu;
	sys = load Sys Sys->PATH;
	S = load String String->PATH;
	C = cu->C;
	J = cu->J;
	T = load StringIntTab StringIntTab->PATH;
	tagtable = CU->makestrinttab(tagnames);
	attrtable = CU->makestrinttab(attrnames);
	ctype = C->ctype;
}

# Make a TokenSource reading bytes from b, converting them with
# chset, and lexing according to media type mtype.
# Also refreshes the module-level dbg and warn flags from the
# current configuration ('x' and 'w' debug switches).
TokenSource.new(b: ref CU->ByteSource, chset : Btos, mtype: int) : ref TokenSource
{
	ts := ref TSstate (
		0,		# bi
		0,		# prevbi
		"",		# s
		0,		# si
		Convcs->Startstate,	# csstate
		Convcs->Startstate	# prevcsstate
	);
	ans := ref TokenSource(
		b,		# b
		chset,		# chset
		ts,		# state
		mtype,		# mtype
		0		# inxmp
	);
	dbg = int (CU->config).dbg['x'];
	warn = (int (CU->config).dbg['w']) || dbg;
	return ans;
}

# Return the next chunk of tokens (at most ToksMax of them),
# or nil if no complete token is available.
# For HTML input, a token that cannot be completed from the bytes
# seen so far causes a rewind to the start of that token, so it is
# retried on a later call.  On return ts.b.lim is updated: to the
# position reached by the incomplete token if that is beyond the
# current limit, otherwise to ts.state.prevbi.
TokenSource.gettoks(ts: self ref TokenSource): array of ref Token
{
	ToksMax : con 500;	# max chunk of tokens returned
	a := array[ToksMax] of ref Token;
	ai := 0;
	pcdai := 0;
	lim := 0;
	# put some dbg output in here
	if(ts.mtype == CU->TextHtml) {
		pcdstate : ref TSstate;
	gather:
		while(ai < ToksMax-1) {	# always allow space for a Data token
			state := getstate(ts);
			c := getchar(ts);
			if(c < 0)
				break;
			tok : ref Token;
			if(c == '<') {
				tok = gettag(ts);
				if (tok != nil && ts.inxmp && tok.tag != Txmp+RBRA) {
					# inside <xmp> only the closing tag is markup;
					# hand back the '<' as literal data
					rewind(ts, state);
					getchar(ts);	# consume the '<'
					tok = ref Token(Data, "<", nil);
				}
				if(tok != nil && tok.tag != Comment) {
					a[ai++] = tok;
					case (tok.tag) {
					Tselect or Ttitle or Toption=>
						# Several tags expect PCDATA after them.
						# Capture state so we can rewind if necessary
						pcdstate = state;
						pcdai = ai-1;
					Ttextarea =>
						pcdstate = state;
						pcdai = ai-1;
						# not sure if we should parse entity references
						tok = gettagdata(ts, tok.tag, 1);
						if(tok != nil) {
							pcdstate = nil;
							a[ai++] = tok;
						}
					Tscript =>
						pcdstate = state;
						pcdai = ai-1;
						# special rules for getting Data
						tok = getscriptdata(ts);
						if(tok != nil) {
							pcdstate = nil;
							a[ai++] = tok;
						}
					Txmp =>
						pcdstate = nil;
						ts.inxmp = 1;
					Txmp+RBRA =>
						pcdstate = nil;
						ts.inxmp = 0;
					Data =>
						;
					Tmeta =>
						# return what we have as soon as a <meta> is seen
						pcdstate = nil;
						break gather;
					* =>
						pcdstate = nil;
					}
				}
			}
			else {
				tok = getdata(ts, c);
				if(tok != nil)
					a[ai++] = tok;
			}
			if(tok == nil && !eof(ts)) {
				# we need more input to complete the token
				lim = ts.state.bi;
				rewind(ts, state);
				break gather;
			}
			else if(dbg > 1 && tok != nil)
				# tok can still be nil here (no token, but at eof),
				# so it must be checked before calling tostring()
				sys->print("lex: got token %s\n", tok.tostring());
		}
		# Several tags expect PCDATA after them.
		# which means that build needs to see another tag or eof
		# after any data in order to know that PCDATA is ended.
		# Rewind if we haven't got to the following tag yet.
		if (pcdstate != nil && !eof(ts)) {
			rewind(ts, pcdstate);
			ai = pcdai;
		}
	}
	else {
		# plain text (non-html) tokens
		while(ai < ToksMax) {
			tok := getplaindata(ts);
			if(tok == nil)
				break;
			else
				a[ai++] = tok;
			if(dbg > 1)
				sys->print("lex: got token %s\n", tok.tostring());
		}
	}
	if(dbg)
		sys->print("lex: returning %d tokens\n", ai);
	if (lim > ts.b.lim)
		ts.b.lim = lim;
	else
		ts.b.lim = ts.state.prevbi;
	if(ai == 0)
		return nil;
	return a[0:ai];
}

# Switch the byte-to-string converter used for subsequent input.
# If only part of the current converted string has been consumed,
# the consumed characters are re-converted with the old converter
# to find how many bytes they occupy, and the byte positions are
# moved to that point.  The conversion state and the buffered
# string are then reset.
#
# must not be called from within TokenSource.gettoks()
# as it will not work with rewind() and ungetchar()
#
TokenSource.setchset(ts: self ref TokenSource, chset: Btos)
{
	st := ts.state;
	nchars := st.si;
	if (nchars > 0 && nchars < len st.s) {
		# align bi to the current input char
		bs := ts.b;
		# only the byte count is needed; the returned state and
		# string are discarded
		(nil, nil, n) := ts.chset->btos(st.prevcsstate, bs.data[st.prevbi:st.bi], nchars);
		st.bi = st.prevbi + n;
		st.prevbi = st.bi;
	}
	ts.chset = chset;
	st.csstate = st.prevcsstate = Convcs->Startstate;
	st.s = nil;
	st.si = 0;
}

# True when there is no buffered string, the byte source has
# reported eof, and prevbi has reached the end of its data.
eof(ts : ref TokenSource) : int
{
	st := ts.state;
	bs := ts.b;
	return (st.s == nil && bs.eof && st.prevbi == bs.edata);
}

# For case where source isn't HTML.
# Just make data tokens, one per line (or partial line,
# at end of buffer), ignoring non-whitespace control
# characters and dumping \r's
# Returns nil if no characters were collected.
getplaindata(ts: ref TokenSource): ref Token
{
	s := "";
	j := 0;

	for(c := getchar(ts); c >= 0; c = getchar(ts)) {
		if(c < ' ') {
			if(ctype[c] == C->W) {
				if(c == '\r') {
					# ignore it unless no following '\n',
					# in which case treat it like '\n'
					c = getchar(ts);
					if(c != '\n') {
						if(c >= 0)
							ungetchar(ts);
						c = '\n';
					}
				}
			}
			else
				c = 0;	# ignore
		}
		if(c != 0)
			s[j++] = c;
		if(c == '\n')
			break;
	}
	if(s == "")
		return nil;
	return ref Token(Data, s, nil);
}

# Gather data up to next start-of-tag or end-of-buffer.
# Translate entity references (&) if not in