GAP 4.8.9 installation with standard packages -- copy to your CoCalc project to get it
#############################################################################
##
#W XMLParser.gi GAPDoc Frank Lübeck
##
##
#Y Copyright (C) 2000, Frank Lübeck, Lehrstuhl D für Mathematik,
#Y RWTH Aachen
##
## The files XMLParser.g{d,i} contain a non-validating XML parser and some
## utilities.
##
BindGlobal("EMPTYCONTENT", 0);
BindGlobal("XMLPARSERFLAGS", rec());
BindGlobal("NAMECHARS",
## here ':' is missing since it will probably become reserved
## for name space syntax in future XML
Set(List(Concatenation([45,46], [48..57], [58], [65..90],
[95], [97..122]), CHAR_INT))
);
## two helper functions for parsing
# next successive characters not in delim (resp. enddelim) - default WHITESPACE
# arg: str, pos[, delim[, enddelim]]
BindGlobal("GetWord", function(arg)
local str, pos, delim, enddelim, len, pos2;
str := arg[1];
pos := arg[2];
if Length(arg)>2 then
delim := arg[3];
if Length(arg)>3 then
enddelim := arg[4];
else
enddelim := delim;
fi;
else
delim := WHITESPACE;
enddelim := WHITESPACE;
fi;
len := Length(str);
while pos <= len and str[pos] in delim do
pos := pos + 1;
od;
pos2 := pos;
while pos2 <= len and not str[pos2] in enddelim do
pos2 := pos2 + 1;
od;
if pos2>len then
return fail;
else
return [pos, pos2-1];
fi;
end);
# first position after pos with character outside chars
BindGlobal("GetChars", function(str, pos, chars)
local len;
len := Length(str);
while pos <= len and str[pos] in chars do
pos := pos + 1;
od;
if pos > len then
return fail;
else
return pos;
fi;
end);
# returns for string, position: [line number, [begin..end position of
# line]] (range without the '\n')
BindGlobal("LineNumberStringPosition", function(str, pos)
local p, nl, l;
p := 0;
l := 0;
nl := 0;
while p<>fail and p < pos do
l := p;
p := Position(str, '\n', p);
nl := nl+1;
od;
if p=fail then
p := Length(str)+1;
fi;
if pos = p then
nl := nl - 1;
fi;
return [nl, [l+1..p-1]];
end);
# printing of error message for non-well formed XML document,
# also shows some text around position of error.
XMLPARSEORIGINS := false;
BindGlobal("ParseError", function(str, pos, comment)
local Show, nl, ShowOrigin, r, badline, i, off;
# for examination of error
Show := function()
Pager(rec(lines := str, start := nl[1]));
end;
ShowOrigin := function()
if XMLPARSEORIGINS <> false then
Pager(rec(lines := StringFile(r[1]), start := r[2]));
else
Show();
fi;
end;
if InfoLevel(InfoXMLParser) > 0 then
if XMLPARSERFLAGS.Encoding <> "UTF-8" then
# we have an 8 bit encoding and must compute offset for original
# position
off := Number([1..pos], function(i) local c; c := INT_CHAR(str[i]);
return c > 127 and c < 192; end);
else
off := 0;
fi;
# this is in UTF-8 document
nl := LineNumberStringPosition(str, pos);
if XMLPARSEORIGINS <> false then
# need offset since in original encoding
r := OriginalPositionDocument(XMLPARSEORIGINS, pos-off);
fi;
Print("XML Parse Error: Line ", nl[1]);
Print(" Character ", pos-nl[2][1]+1, "\n");
if XMLPARSEORIGINS <> false then
Print("Original file: ", r[1], ", line number ", r[2],".\n");
fi;
badline := str{nl[2]};
# to be perfect, consider current TERM encoding, ignore for now
Print("-----------\n", badline, "\n");
# this uses the same non-' ' whitespace to get the '^' at the right position
for i in [1..pos-nl[2][1]] do
if not badline[i] in WHITESPACE then
badline[i] := ' ';
fi;
od;
Print(badline{[1..pos-nl[2][1]]});
Print("^", "\n-----------\n", comment, "\n!!! Type 'Show();' to watch the",
" input string in pager - starting with\n line containing error !!!\n");
if XMLPARSEORIGINS <> false then
Print("Or 'ShowOrigin();' to look it up in its source file.\n");
fi;
fi;
Error();
end);
## a container to collect named entities for the parser
BindGlobal("ENTITYDICT", rec());
## the default XML entities
BindGlobal("ENTITYDICT_default", rec(
lt := "&#60;",
gt := ">",
amp := "&#38;",
apos := "'",
quot := """) );
## the predefined entities of the GAPDoc package
## in LaTeX we use some saved boxes defined in our preamble for
## \, ~, ^, {, } because we think that the \texttt versions of these
## characters look better (than mathmode chars or accents without letter)
# (although this is a general XML parser, we make it convenient for
# GAPDoc documents)
BindGlobal("ENTITYDICT_GAPDoc", rec(
# compatibility entities, no longer needed by GAPDoc >= 1.0
tamp := "&",
tlt := "<",
tgt := ">",
hash := "#",
dollar := "$",
percent := "%",
tilde := "~",
bslash := "\\",
obrace := "{",
cbrace := "}",
uscore := "_",
circum := "^",
nbsp := " ",
copyright := "©",
ndash := "–",
GAP := "<Package>GAP</Package>",
GAPDoc := "<Package>GAPDoc</Package>",
TeX := "<Alt Only='LaTeX'>{\\TeX}</Alt><Alt Not='LaTeX'>TeX</Alt>",
LaTeX := "<Alt Only='LaTeX'>{\\LaTeX}</Alt><Alt Not='LaTeX'>LaTeX</Alt>",
BibTeX := "<Alt Only='LaTeX'>Bib{\\TeX}</Alt><Alt Not='LaTeX'>BibTeX</Alt>",
MeatAxe := "<Package>MeatAxe</Package>",
XGAP := "<Package>XGAP</Package>",
CC := "ℂ",
ZZ := "ℤ",
NN := "ℕ",
PP := "ℙ",
QQ := "ℚ",
HH := "ℍ",
RR := "ℝ",
)
);
## Parsing and resolving an entity, the needed substitution text for
## non-character entities must be bound in ENTITYDICT.
## -- assuming str[pos-1] = '&'
## -- returns pseudo-element (Char)EntityValue with content the result string
## -- character entities are just substituted and returned as string
## -- the replacement for other entities is reparsed for recursive
## substitution
InstallGlobalFunction(GetEnt, function(str, pos)
local d, i, ch, pos1, nam, doc, res, ent;
# character entity
if str[pos] = '#' then
d := "";
if str[pos+1] = 'x' then
i := pos + 2;
while str[i] <> ';' do
Add(d, str[i]);
i := i+1;
od;
d := NumberDigits(d, 16);
else
i := pos+1;
while str[i] <> ';' do
Add(d, str[i]);
i := i+1;
od;
d := NumberDigits(d, 10);
fi;
# must consider this as unicode, translate it to UTF-8
res := rec(name := "CharEntityValue", next := i+1,
content := Encode(Unicode([d]), "UTF-8"));
return res;
fi;
# else replace and reparse for recursive entity replacements
pos1 := Position(str, ';', pos-1);
if pos1=pos then
ParseError(str, pos, "empty entity name not allowed");
elif pos1 = fail then
ParseError(str, pos, "no semicolon in entity reference");
fi;
nam := str{[pos..pos1-1]};
if not IsBound(ENTITYDICT.(nam)) then
# XXX error or better going on here?
## ParseError(str, pos, "don't know entity name");
Info(InfoXMLParser, 1, "#W WARNING: Entity with name `", nam,
"' not known!\n#W", " (Specify in <!DOCTYPE ...> tag or ",
"in argument to parser!)\n");
doc := Concatenation("UNKNOWNEntity(", nam, ")");
else
doc := ENTITYDICT.(nam);
fi;
i := 1;
res := "";
while i <= Length(doc) do
if doc[i] <> '&' or (i<Length(doc) and doc[i+1] <> '#') then
Add(res, doc[i]);
i := i+1;
else
ent := GetEnt(doc, i+1);
Append(res, ent.content);
i := ent.next;
fi;
od;
return rec(name := "EntityValue", content := res, next := pos1+1);
end);
## reading a start tag including attribute values
# returns rec(name := elementname,
# attributes := rec( attributename1 := attributevalue1, ...)
# content := EMPTYCONTENT or [] (to be filled recursively)
# next := positon in string after start tag )
# Special handling of case pos=1: the element name is not parsed but assumed
# to be WHOLEDOCUMENT; this way a complete document can be put in one pseudo
# element of this name.
# assuming str[pos-1] = '<' and str[pos]<>'/'
InstallGlobalFunction(GetSTag, function(str, pos)
local res, pos2, start, attr, atval, delim, a, ent;
res := rec(attributes := rec());
# a small hack that allows to call GetElement with a whole document
# after appending "</WHOLEDOCUMENT>"
if pos=1 then
res.name := "WHOLEDOCUMENT";
res.next := 1;
res.content := [];
res.input := ShallowCopy(str);
return res;
fi;
# name of element
pos2 := GetChars(str, pos, NAMECHARS);
if pos2=fail then
ParseError(str, pos, "documents ends in element name");
fi;
if pos2=pos then
ParseError(str, pos, "tag must start with name \'<name ...\'");
fi;
res.name := str{[pos..pos2-1]};
# look for attributes or end of tag
pos := GetChars(str, pos2, WHITESPACE);
if pos=fail then
ParseError(str, pos2, "document ends in tag");
fi;
while not str[pos] in "/>" do
if not str[pos-1] in WHITESPACE then
ParseError(str, pos-1, Concatenation("there must be white space ",
"before attribute name"));
fi;
pos2 := GetChars(str, pos, NAMECHARS);
if pos2=fail then
ParseError(str, pos, "document ends in attribute name");
fi;
if pos2=pos then
ParseError(str, pos, "attribute must have non-empty name");
fi;
# reading attribute value
attr := str{[pos..pos2-1]};
## if not (str[pos2] = '=' and str[pos2+1] in "\"'") then
## ParseError(str, pos2, Concatenation("attribute must be specified ",
## "in form \'attr=\"text\"\'"));
## fi;
## delim := str[pos2+1];
# can be white space around =
pos2 := GetChars(str, pos2, WHITESPACE);
if pos2 = fail or str[pos2] <> '=' then
ParseError(str, pos2, "expecting '=' for attribute value");
fi;
pos2 := GetChars(str, pos2+1, WHITESPACE);
if pos2 = fail or not str[pos2] in "\"'" then
ParseError(str, pos2, "expecting quotes for attribute value");
fi;
delim := str[pos2];
atval := "";
pos2 := pos2 + 1;
while str[pos2] <> delim do
# we allow attr='fkjf"fafds' as well, see AnnStd 2.3
pos2 := GetWord(str, pos2, "", "<&\"'");
if pos2=fail then
ParseError(str, pos, "document ends in attribute value");
fi;
# must allow &xyz; for entity resolution as well
if not str[pos2[2]+1] = delim then
if str[pos2[2]+1] = '&' then
ent := GetEnt(str, pos2[2]+2);
Append(atval, str{[pos2[1]..pos2[2]]});
start := pos2[2]+2;
pos2 := ent.next;
if ent.name = "CharEntityValue" then
Append(atval, ent.content);
else
# now ent.content may still contain some character entities, but
# no '<' and so no markup
if '<' in ent.content then
ParseError(str, start,
"entity replacement in attribute value cannot contain '<'");
fi;
ent := GetElement(Concatenation(ent.content,"</WHOLEDOCUMENT>"),1);
if IsString(ent.content) then
Append(atval, ent.content);
else
for a in ent.content do
Append(atval, a.content);
od;
fi;
fi;
elif str[pos2[2]+1] in "\"'" then
Append(atval, str{[pos2[1]..pos2[2]+1]});
pos2 := pos2[2]+2;
else
ParseError(str, pos2[2]+1, "non valid character in attribute value");
fi;
else
Append(atval, str{[pos2[1]..pos2[2]]});
pos2 := pos2[2]+1;
fi;
od;
res.attributes.(attr) := atval;
pos2 := pos2+1;
pos := GetChars(str, pos2, WHITESPACE);
if pos=fail then
ParseError(str, pos2, "document ends in tag");
fi;
od;
if str[pos] = '/' then
res.content := EMPTYCONTENT;
pos := pos+1;
else
res.content := [];
fi;
if not str[pos] = '>' then
ParseError(str, pos, "expecting end of tag \'>\' here");
fi;
res.next := pos+1;
return res;
end);
## reading an end tag,
## returns rec( name := elementname,
## next := first position after this end tag)
# assuming str{[pos-2,pos-1]} = "</"
InstallGlobalFunction(GetETag, function(str, pos)
local res, pos2;
res := rec();
# name of element
pos2 := GetChars(str, pos, NAMECHARS);
if pos2=fail then
ParseError(str, pos, "documents ends in element name");
fi;
if pos2=pos then
ParseError(str, pos, "end tag must start with name \'</name ...\'");
fi;
res.name := str{[pos..pos2-1]};
pos := pos2;
pos2 := GetChars(str, pos, WHITESPACE);
if pos2=fail then
ParseError(str, pos, "documents ends inside end tag");
fi;
if str[pos2] <> '>' then
ParseError(str, pos2, "expecting end of tag \'>\' here");
fi;
res.next := pos2+1;
return res;
end);
## reading an element: start tag, content (with recursive calls of
## GetElement) and end tag
# returns record explained before GetSTag, but with .content component
# filled
# assuming str[pos-1] = '<' and str[pos] in NAMECHARS
# (in this function we read entity definitions inside a <!DOCTYPE declaration)
InstallGlobalFunction(GetElement, function(str, pos)
local res, r, s, pos2, lev, dt, p, nam, val, el, tmp;
res := GetSTag(str,pos);
res.start := pos - 1;
# case of empty element
if res.content = EMPTYCONTENT then
res.stop := res.next - 1;
return res;
fi;
pos := res.next;
while true do
if str[pos] = '&' then
# resolve entity
r := GetEnt(str, pos+1);
pos := r.next;
if r.name = "CharEntityValue" then
# consider as PCDATA
r.name := "PCDATA";
Add(res.content, r);
else
# we have to parse the result
s := Concatenation(r.content, "</WHOLEDOCUMENT>");
r := GetElement(s, 1);
Append(res.content, r.content);
fi;
elif str[pos] = '<' then
if str[pos+1] = '?' then
# processing instruction (PI), we repeat it literally
pos2 := PositionSublist(str, "?>", pos+2);
if pos2=fail then
ParseError(str, pos+2, "document ends within processing instruction");
fi;
tmp := str{[pos+2..pos2-1]};
Add(res.content, rec(name := "XMLPI", content := tmp));
# check for encoding information
if Length(tmp) > 3 and tmp{[1..4]} = "xml " then
tmp := Concatenation(tmp, "/>");
tmp := GetElement(tmp, 3);
if IsBound(tmp.attributes.encoding) then
tmp := tmp.attributes.encoding;
if UNICODE_RECODE.NormalizedEncoding(tmp) = fail then
Error("Cannot parse document in encoding ", tmp, "\n");
fi;
XMLPARSERFLAGS.Encoding := UNICODE_RECODE.NormalizedEncoding(tmp);
# if not in UTF-8 encoding we recode rest of the document now
if XMLPARSERFLAGS.Encoding <> "UTF-8" then
Info(InfoGAPDoc, 1, "#I recoding input from ",
XMLPARSERFLAGS.Encoding, " to UTF-8 . . .\n");
tmp := Encode(Unicode(str{[pos..Length(str)]},
XMLPARSERFLAGS.Encoding), "UTF-8");
str{[pos..pos-1+Length(tmp)]} := tmp;
fi;
fi;
fi;
pos := pos2+2;
elif str[pos+1] = '!' then
if str[pos+2] = '-' and str[pos+3] = '-' then
## comment
# here we ignore the restriction that inside comment
# no "--" is allowed.
pos2 := PositionSublist(str, "-->", pos+4);
if pos2=fail then
ParseError(str,pos+4, "document ends within comment");
fi;
Add(res.content, rec(name := "XMLCOMMENT",
content := str{[pos+4..pos2-1]}));
pos := pos2+3;
elif str[pos+2] = 'D' and str{[pos+3..pos+8]} = "OCTYPE" and
str[pos+9] in WHITESPACE then
## <!DOCTYPE ....
## end of this tag is matching ">"
## we have to read ENTITY declarations
pos2 := pos+10;
lev := 0;
while str[pos2] <> '>' or lev > 0 do
if str[pos2] = '<' then
lev := lev+1;
elif str[pos2] = '>' then
lev := lev-1;
fi;
pos2 := pos2+1;
if pos2>Length(str) then
ParseError(str,pos+10, "document ends within DOCTYPE tag");
fi;
od;
dt := rec(name := "XMLDOCTYPE",
content := str{[pos+10..pos2-1]});
## convenience for parsing GAPDoc document, here we add the
## GAPDoc defined entities automatically
pos := PositionSublist(dt.content, "gapdoc.dtd");
if pos <> fail and dt.content[pos-1] in "'\"/" then
for p in RecNames(ENTITYDICT_GAPDoc) do
ENTITYDICT.(p) := ENTITYDICT_GAPDoc.(p);
od;
fi;
Add(res.content, dt);
## parse entity declarations in here (no good error checking)
pos := PositionSublist(dt.content, "<!ENTITY");
while pos <> fail do
p := GetWord(dt.content, pos+8);
nam := dt.content{[p[1]..p[2]]};
# value enclosed in ".." or '..'
p := p[2]+1;
while dt.content[p] in WHITESPACE do
p := p + 1;
od;
p := [p+1];
Add(p, Position(dt.content, dt.content[p[1]-1], p[1])-1);
val := dt.content{[p[1]..p[2]]};
ENTITYDICT.(nam) := val;
pos := PositionSublist(dt.content, "<!ENTITY", p[2]);
od;
pos := pos2+1;
elif str[pos+2] = '[' and str{[pos+3..pos+8]} = "CDATA[" then
## <![CDATA[ everything is verbose text until "]]>"
pos2 := PositionSublist(str, "]]>", pos+9);
if pos2=fail then
ParseError(str,pos+10, "document ends within CDATA text");
fi;
if pos2>pos+9 then
Add(res.content, rec(name := "PCDATA",
content := str{[pos+9..pos2-1]}));
fi;
pos := pos2+3;
else
ParseError(str, pos, "unknown \"<!\"-tag");
fi;
elif str[pos+1] = '/' then
## end tag, must be the right one corresponding to the
## current element
el := GetETag(str, pos+2);
if res.name <> el.name then
ParseError(str, pos, Concatenation("wrong end tag, expecting \"</",
res.name, ">\" (starts line ",
String(LineNumberStringPosition(str, res.start)[1]), ")"));
else
res.stop := el.next - 1;
res.next := el.next;
break;
fi;
elif not str[pos+1] in NAMECHARS then
ParseError(str, pos+1, "not allowed character after '<'");
else
## a new element starts, call GetElement recursively
el := GetElement(str, pos+1);
Add(res.content, el);
pos := el.next;
fi;
else
pos2 := GetWord(str, pos, "", "<&");
if pos2 = fail then
ParseError(str, pos, "document ends before end of current element");
fi;
if pos2[2] >= pos then
Add(res.content, rec(name := "PCDATA",
content := str{[pos..pos2[2]]}));
fi;
pos := pos2[2]+1;
fi;
od;
return res;
end);
## the user function for parsing an XML document stored in a string,
## adds end tag for pseudo element WHOLEDOCUMENT (see before GetSTag)
## and calls GetElement
## <#GAPDoc Label="ParseTreeXMLString">
## <ManSection >
## <Func Arg="str[, srcinfo][, entitydict]" Name="ParseTreeXMLString" />
## <Func Arg="fname[, entitydict]" Name="ParseTreeXMLFile" />
## <Returns>a record which is root of a tree structure</Returns>
## <Description>
## The first function parses an XML-document stored in string <A>str</A>
## and returns the document in form of a tree.<P/>
##
## The optional argument <A>srcinfo</A> must have the same format
## as in <Ref Func="OriginalPositionDocument" />. If it is given then
## error messages refer to the original source of the text with the
## problem.<P/>
##
## With the optional argument <A>entitydict</A> named entities can be
## given to the parser, for example entities which are defined in the
## <C>.dtd</C>-file (which is not read by this parser). The standard
## XML-entities do not need to be provided, and for &GAPDoc; documents
## the entity definitions from <C>gapdoc.dtd</C> are automatically
## provided. Entities in the document's <C><!DOCTYPE</C> declaration
## are parsed and also need not to be provided here. The argument
## <A>entitydict</A> must be a record where each component name is an entity
## name (without the surrounding & and ;) to which is assigned its
## substitution string.<P/>
##
## The second function is just a shortcut for <C>ParseTreeXMLString(
## StringFile(</C><A>fname</A><C>), ... )</C>, see <Ref Func="StringFile"/>.
## <P/>
##
## After these functions return the list of named entities which were known
## during the parsing can be found in the record <C>ENTITYDICT</C>. <P/>
##
## A node in the result tree corresponds to an XML element, or to some
## parsed character data. In the first case it looks as follows:
##
## <Listing Type="Example Node">
## rec( name := "Book",
## attributes := rec( Name := "EDIM" ),
## content := [ ... list of nodes for content ...],
## start := 312,
## stop := 15610,
## next := 15611 )
## </Listing>
##
## This means that <C><A>str</A>{[312..15610]}</C> looks like
## <C><Book Name="EDIM"> ... content ... </Book></C>.<P/>
##
## The leaves of the tree encode parsed character data as in the
## following example:
##
## <Listing Type="Example Node">
## rec( name := "PCDATA",
## content := "text without markup " )
## </Listing>
##
## This function checks whether the XML document is <Emph>well
## formed</Emph>, see <Ref Chap="XMLvalid" /> for an explanation.
## If an error in the XML structure is found, a break loop is
## entered and the text around the position where the problem starts
## is shown. With <C>Show();</C> one can browse the original input
## in the <Ref BookName="Ref" Func="Pager" />, starting with the
## line where the error occurred.
##
## All entities are resolved when they are either entities defined
## in the &GAPDoc; package (in particular the standard XML entities)
## or if their definition is included in the <C><!DOCTYPE ..></C>
## tag of the document.<P/>
##
## Note that <Ref Func="ParseTreeXMLString" /> does not parse
## and interpret the corresponding document type definition (the
## <C>.dtd</C>-file given in the <C><!DOCTYPE ..></C> tag). Hence
## it also does not check the <Emph>validity</Emph> of the document
## (i.e., it is no <Emph>validating XML parser</Emph>).<P/>
##
## If you are using this function to parse a &GAPDoc; document
## you can use <Ref Func="CheckAndCleanGapDocTree" /> for some
## validation and additional checking of the document structure.
##
## </Description>
## </ManSection>
## <#/GAPDoc>
##
InstallGlobalFunction(ParseTreeXMLString, function(arg)
local str, ents, res, a;
# artificial end tag to wrap document in one element
str := Concatenation(arg[1], "</WHOLEDOCUMENT>");
# default encoding is UTF-8, may be changed if we find a <?xml ...
# processing instruction
XMLPARSERFLAGS.Encoding := "UTF-8";
if Length(arg) > 1 and IsList(arg[2]) then
XMLPARSEORIGINS := arg[2];
else
XMLPARSEORIGINS := false;
fi;
# reset ENTITYDICT
for a in RecNames(ENTITYDICT) do
Unbind(ENTITYDICT.(a));
od;
for a in RecNames(ENTITYDICT_default) do
ENTITYDICT.(a) := ENTITYDICT_default.(a);
od;
# maybe load more entities from last argument
if Length(arg) > 1 and IsRecord(arg[Length(arg)]) then
ents := arg[Length(arg)];
for a in RecNames(ents) do
ENTITYDICT.(a) := ents.(a);
od;
fi;
res := GetElement(str, 1);
res.input := ShallowCopy(arg[1]);
if XMLPARSEORIGINS <> false then
res.inputorigins := XMLPARSEORIGINS;
fi;
return res;
end);
InstallGlobalFunction(ParseTreeXMLFile, function(arg)
arg := ShallowCopy(arg);
arg[1] := StringFile(arg[1]);
return CallFuncList(ParseTreeXMLString, arg);
end);
## Print document tree structure (without the PCDATA entries)
## <#GAPDoc Label="DisplayXMLStructure">
## <ManSection >
## <Func Arg="tree" Name="DisplayXMLStructure" />
## <Description>
## This utility displays the tree structure of an XML document as it
## is returned by <Ref Func="ParseTreeXMLString" /> (without the
## <C>PCDATA</C> leaves).<P/>
##
## Since this is usually quite long the result is shown using the
## <Ref BookName="ref" Func="Pager" />.
## </Description>
## </ManSection>
## <#/GAPDoc>
##
InstallGlobalFunction(DisplayXMLStructure, function(doc)
local NL, prs, app, str;
str := "";
NL := "\n";
app := function(arg)
local i;
for i in [2..Length(arg)] do
Append(arg[1], arg[i]);
od;
end;
prs := function(doc, indent)
local a, c, indentnext;
if doc.name = "PCDATA" then
return;
fi;
if IsBound(doc.count) then
c := String(doc.count);
else
c := "";
fi;
app(str, indent, c, " ", doc.name, NL);
if IsBound(doc.attributes) then
for a in NamesOfComponents(doc.attributes) do
app(str, indent," #",a,":",doc.attributes.(a), NL);
od;
fi;
if doc.content = EMPTYCONTENT then
app(str, indent, " # empty element\n");
elif IsString(doc.content) then
## ??? too much output
## Print(indent, " # data\n");
else
for a in doc.content do
indentnext := Concatenation(indent, " ");
prs(a, indentnext);
od;
fi;
end;
prs(doc, "");
Page(str);
end);
## apply a function to all nodes of a parse tree
## <#GAPDoc Label="ApplyToNodesParseTree">
## <ManSection >
## <Func Arg="tree, fun" Name="ApplyToNodesParseTree" />
## <Func Arg="tree" Name="AddRootParseTree" />
## <Func Arg="tree" Name="RemoveRootParseTree" />
## <Description>
## The function <Ref Func="ApplyToNodesParseTree" /> applies a
## function <A>fun</A> to all nodes of the parse tree <A>tree</A>
## of an XML document returned by <Ref Func="ParseTreeXMLString"
## />.<P/>
##
## The function <Ref Func="AddRootParseTree" /> is an application of
## this. It adds to all nodes a component <C>.root</C> to which
## the top node tree <A>tree</A> is assigned. These components can be
## removed afterwards with <Ref Func="RemoveRootParseTree" />.
## </Description>
## </ManSection>
## <#/GAPDoc>
##
InstallGlobalFunction(ApplyToNodesParseTree, function(r, f)
local ff;
ff := function(rr)
local a;
if IsList(rr.content) and not IsString(rr.content) then
for a in rr.content do
f(a);
ff(a);
od;
fi;
end;
f(r);
ff(r);
end);
## This is useful for things like indexing where one should have
## access to the root of the document tree during the whole processing
InstallGlobalFunction(AddRootParseTree, function(r)
ApplyToNodesParseTree(r, function(a) a.root := r; end);
end);
## And this throws away the links
InstallGlobalFunction(RemoveRootParseTree, function(r)
ApplyToNodesParseTree(r, function(a) Unbind(a.root); end);
end);
## <#GAPDoc Label="StringXMLElement">
## <ManSection >
## <Func Arg="tree" Name="StringXMLElement" />
## <Returns>a list <C>[string, positions]</C></Returns>
## <Description>
##
## The argument <A>tree</A> must have a format of a node in the parse tree
## of an XML document as returned by <Ref Func="ParseTreeXMLString"/>
## (including the root node representing the full document). This function
## computes a pair <C>[string, positions]</C> where <C>string</C> contains
## XML code which is equivalent to the code which was parsed to get
## <A>tree</A>. And <C>positions</C> is a list of lists of four numbers
## <C>[eltb, elte, contb, conte]</C>. There is one such list for each XML
## element occuring in <C>string</C>, where <C>eltb</C> and <C>elte</C> are
## the begin and end position of this element in <C>string</C> and where
## <C>contb</C> and <C>conte</C> are begin and end position of the content
## of this element, or both are <C>0</C> if there is no content.<P/>
##
## Note that parsing XML code is an irreversible task, we can only expect
## to get equivalent XML code from this function. But parsing the resulting
## <C>string</C> again and applying <Ref Func="StringXMLElement"/> again
## gives the same result. See the function <Ref Func="EntitySubstitution"/>
## for back-substitutions of entities in the result.
## </Description>
## </ManSection>
## <#/GAPDoc>
# args: r[, count, pos, str] count, pos, str is for use within recursion
StringXMLElement := function(arg)
local r, str, pos, p, tmp, att, a;
if Length(arg) = 1 then
r := arg[1];
str := StringXMLElement(r, [], "");
# revert the WHOLEDOCUMENT trick of the parser
if IsRecord(r) and r.name = "WHOLEDOCUMENT" then
str[1] := str[1]{[16..Length(str[1])-16]};
str[2] := str[2] - 15;
str[2][Length(str[2])] := [1, Length(str[1]), 1, Length(str[1])];
for a in str[2] do
if a[3] = -15 then
a[3] := 0;
a[4] := 0;
fi;
od;
fi;
return str;
fi;
# now we are in the recursion
r := arg[1];
pos := arg[2];
str := arg[3];
if IsRecord(r) then
if r.name = "PCDATA" then
return StringXMLElement(r.content, pos, str);
elif r.name = "XMLPI" then
p := Length(str)+1;
Append(str, Concatenation("<?", r.content, "?>"));
Add(pos, [p, Length(str), 0, 0]);
return [str, pos];
elif r.name = "XMLDOCTYPE" then
p := Length(str)+1;
Append(str, Concatenation("<!DOCTYPE ", r.content, ">"));
Add(pos, [p, Length(str), 0, 0]);
return [str, pos];
elif r.name = "XMLCOMMENT" then
p := Length(str)+1;
Append(str, Concatenation("<!--", r.content, "-->"));
Add(pos, [p, Length(str), 0, 0]);
return [str, pos];
fi;
fi;
if IsString(r) then
r := SubstitutionSublist(r, "&", "&");
r := SubstitutionSublist(r, "<", "<");
if Length(r) > 0 then
Append(str, r);
fi;
return [str, pos];
fi;
p := [Length(str)+1];
Append(str, "<");
Append(str, r.name);
for att in RecNames(r.attributes) do
Add(str, ' ');
Append(str, att);
Append(str, "=\"");
tmp := SubstitutionSublist(r.attributes.(att), "\"", """);
tmp := SubstitutionSublist(tmp, "&", "&");
tmp := SubstitutionSublist(tmp, "<", "<");
if Length(tmp)>0 then
fi;
Append(str, tmp);
Append(str, "\"");
od;
if r.content = 0 then
Append(str, "/>");
Add(pos, [p[1], Length(str), 0, 0]);
return [str, pos];
fi;
Add(str, '>');
p[3] := Length(str)+1;
if IsString(r.content) then
StringXMLElement(r.content, pos, str);
else
for a in r.content do
StringXMLElement(a, pos, str);
od;
fi;
p[4] := Length(str);
Append(str, "</");
Append(str, r.name);
Add(str, '>');
p[2] := Length(str);
Add(pos, p);
return [str, pos];
end;
## <#GAPDoc Label="EntitySubstitution">
## <ManSection >
## <Func Arg="xmlstring, entities" Name="EntitySubstitution" />
## <Returns>a string</Returns>
## <Description>
## The argument <A>xmlstring</A> must be a string containing XML
## code or a pair <C>[string, positions]</C> as returned by <Ref
## Func="StringXMLElement"/>. The argument <A>entities</A> specifies entity
## names (without the surrounding <A>&</A> and <C>;</C>) and their
## substitution strings, either a list of pairs of strings or as a record
## with the names as components and the substitutions as values.<P/>
##
## This function tries to substitute non-intersecting parts of
## <C>string</C> by the given entities. If the <C>positions</C> information
## is given then only parts of the document which allow a valid
## substitution by an entity are considered. Otherwise a simple text
## substitution without further check is done. <P/>
##
## Note that in general the entity resolution in XML documents is a
## complicated and non-reversible task. But nevertheless this utility may
## be useful in not too complicated situations.
## </Description>
## </ManSection>
## <#/GAPDoc>
EntitySubstitution := function(xmlstr, entities)
local posinfo, entities2, check, subs, pos, npos, new, res, off, a;
if not IsString(xmlstr) then
posinfo := xmlstr[2];
xmlstr := xmlstr[1];
else
posinfo := fail;
fi;
if IsRecord(entities) then
entities := List(RecNames(entities), f-> [f, entities.(f)]);
fi;
# parse and rewrite entities
entities2 := List(entities, a-> [a[1], StringXMLElement(
ParseTreeXMLString(a[2]))[1]]);
# checks if beginning and end of a substring are in the content of the
# same element (if this information is available)
check := function(b, e)
local pb, a;
if posinfo = fail then
return true;
fi;
pb := [-1];
for a in posinfo do
if a[1] <= b and a[1] > pb[1] and a[2] >= e then
pb := a;
fi;
od;
if b = pb[1] and e = pb[2] then
return true;
fi;
for a in posinfo do
if a <> pb and a[1] > pb[1] and a[2] < pb[2] then
if not Intersection([b..e],[a[1]..a[2]]) in [[], [a[1]..a[2]]] then
return false;
fi;
fi;
od;
return true;
end;
subs := [];
for a in entities2 do
if not a[1] in ["lt", "gt", "amp", "apos", "quot"] then
pos := 0;
while pos <> fail do
npos := PositionSublist(xmlstr, a[2], pos);
if npos <> fail and check(npos, npos-1+Length(a[2])) then
new := [npos, npos-1+Length(a[2]), a];
if ForAll(subs, b-> b[1] > new[2] or b[2] < new[1]) then
Add(subs, new);
fi;
pos := new[2];
else
pos := npos;
fi;
od;
fi;
od;
Sort(subs);
if Length(subs) > 0 then
res := xmlstr{[1..subs[1][1]-1]};
off := 0;
res := "";
for a in subs do
Append(res, xmlstr{[off+1..a[1]-1]});
Append(res, Concatenation("&", a[3][1], ";"));
off := a[2];
od;
Append(res, xmlstr{[off+1..Length(xmlstr)]});
xmlstr := res;
fi;
return xmlstr;
end;
## <#GAPDoc Label="GetTextXMLTree">
## <ManSection >
## <Func Arg="tree" Name="GetTextXMLTree" />
## <Returns>a string</Returns>
## <Description>
## The argument <A>tree</A> must be a node of a parse tree of some
## XML document, see <Ref Func="ParseTreeXMLFile"/>.
## This function collects the content of this and all included elements
## recursively into a string.
## </Description>
## </ManSection>
## <#/GAPDoc>
# extract and collect text in elements recursively
InstallGlobalFunction(GetTextXMLTree, function(r)
local res, fun;
res := "";
fun := function(r)
if IsString(r.content) then
Append(res, r.content);
fi;
end;
ApplyToNodesParseTree(r, fun);
return res;
end);
## <#GAPDoc Label="XMLElements">
## <ManSection >
## <Func Arg="tree, eltnames" Name="XMLElements" />
## <Returns>a list of nodes</Returns>
## <Description>
## The argument <A>tree</A> must be a node of a parse tree of some
## XML document, see <Ref Func="ParseTreeXMLFile"/>.
## This function returns a list of all subnodes of <A>tree</A> (possibly
## including <A>tree</A>) of elements with name given in the list of strings
## <A>eltnames</A>. Use <C>"PCDATA"</C> as name for leave nodes which contain
## the actual text of the document. As an abbreviation <A>eltnames</A> can also
## be a string which is then put in a one element list.
## </Description>
## </ManSection>
## <#/GAPDoc>
# return list of nodes of elements with name in 'eltnames' from XML tree r
InstallGlobalFunction(XMLElements, function(r, eltnames)
local res, fun;
if IsString(eltnames) then
eltnames := [eltnames];
fi;
res := [];
fun := function(r)
if r.name in eltnames then
Add(res, r);
fi;
end;
ApplyToNodesParseTree(r, fun);
return res;
end);