On this page
parsexml
This module implements a simple high performance XML / HTML parser. The only encoding that is supported is UTF-8. The parser has been designed to be somewhat error correcting, so that even most "wild HTML" found on the web can be parsed with it. Note: This parser does not check that each <tag>
has a corresponding </tag>
! These checks have to be implemented by the client code for various reasons:
- Old HTML contains tags that have no end tag:
<br>
for example. - HTML tags are case insensitive, XML tags are case sensitive. Since this library can parse both, only the client knows which comparison is to be used.
- Thus the checks would have been very difficult to implement properly with little benefit, especially since they are simple to implement in the client. The client should use the
errorMsgExpected
proc to generate a nice error message that fits the other error messages this library creates.
Example 1: Retrieve HTML title
The file examples/htmltitle.nim
demonstrates how to use the XML parser to accomplish a simple task: To determine the title of an HTML document.
# Example program to show the parsexml module
# This program reads an HTML file and writes its title to stdout.
# Errors and whitespace are ignored.
import os, streams, parsexml, strutils
# Entry script: print the text of the first <title> element found in the HTML
# file named on the command line, then exit.

if paramCount() < 1:
  quit("Usage: htmltitle filename[.html]")

# The usage text advertises the ".html" extension as optional.
var filename = addFileExt(paramStr(1), "html")
var s = newFileStream(filename, fmRead)
# A nil stream is treated as "file could not be opened".
if s == nil: quit("cannot open the file " & filename)

var x: XmlParser
open(x, s, filename)
while true:
  x.next()
  case x.kind
  of xmlElementStart:
    # HTML tag names are case insensitive, hence ``cmpIgnoreCase``.
    if cmpIgnoreCase(x.elementName, "title") == 0:
      var title = ""
      x.next() # skip "<title>"
      # Concatenate every consecutive character data event.
      while x.kind == xmlCharData:
        title.add(x.charData)
        x.next()
      if x.kind == xmlElementEnd and cmpIgnoreCase(x.elementName, "title") == 0:
        echo("Title: " & title)
        quit(0) # Success! (exits here; the ``x.close()`` below is not reached)
      else:
        # The closing tag is missing: report it in the parser's own message
        # format and keep scanning the rest of the document.
        echo(x.errorMsgExpected("/title"))
  of xmlEof: break # end of file reached
  else: discard # ignore other events

# Only reached when the end of the file was hit without a complete title.
x.close()
quit("Could not determine title!")
Example 2: Retrieve all HTML links
The file examples/htmlrefs.nim
demonstrates how to use the XML parser to accomplish another simple task: To determine all the links an HTML document contains.
# Example program to show the new parsexml module
# This program reads an HTML file and writes all its used links to stdout.
# Errors and whitespace are ignored.
import os, streams, parsexml, strutils
func `=?=`(a, b: string): bool =
  ## Case-insensitive string equality.
  ##
  ## Little trick: define our own comparator that ignores case, so that tag
  ## and attribute names can be compared with infix syntax, e.g.
  ## ``x.elementName =?= "a"``.
  ##
  ## Returns ``true`` when ``cmpIgnoreCase`` reports ``a`` and ``b`` as equal.
  cmpIgnoreCase(a, b) == 0
# Entry script: print "<description>: <link>" for the ``<a href=...>`` elements
# of the HTML file named on the command line, followed by the link count.

if paramCount() < 1:
  quit("Usage: htmlrefs filename[.html]")

var links = 0 # count the number of links
# The usage text advertises the ".html" extension as optional.
var filename = addFileExt(paramStr(1), "html")
var s = newFileStream(filename, fmRead)
# A nil stream is treated as "file could not be opened".
if s == nil: quit("cannot open the file " & filename)

var x: XmlParser
open(x, s, filename)
next(x) # get first event
# Invariant of the main loop: every branch advances the parser by at least one
# event (or leaves the loop), so each iteration starts on an unhandled event.
block mainLoop:
  while true:
    case x.kind
    of xmlElementOpen:
      # the <a href = "xyz"> tag we are interested in always has an attribute,
      # thus we search for ``xmlElementOpen`` and not for ``xmlElementStart``
      if x.elementName =?= "a":
        x.next()
        # NOTE: only the event directly after the element name is inspected,
        # so a link is recognised only when ``href`` is the first attribute.
        if x.kind == xmlAttribute:
          if x.attrKey =?= "href":
            var link = x.attrValue
            inc(links)
            # skip until we have an ``xmlElementClose`` event
            while true:
              x.next()
              case x.kind
              of xmlEof: break mainLoop
              of xmlElementClose: break
              else: discard
            x.next() # skip ``xmlElementClose``
            # now we have the description for the ``a`` element
            var desc = ""
            while x.kind == xmlCharData:
              desc.add(x.charData)
              x.next()
            echo(desc & ": " & link)
      else:
        # not an ``a`` element: move on to the next event
        x.next()
    of xmlEof: break # end of file reached
    of xmlError:
      echo(errorMsg(x))
      x.next()
    else: x.next() # skip other events

echo($links & " link(s) found!")
x.close()
Imports
Types
-
XmlEventKind = enum xmlError, ## an error occurred during parsing xmlEof, ## end of file reached xmlCharData, ## character data xmlWhitespace, ## whitespace has been parsed xmlComment, ## a comment has been parsed xmlPI, ## processing instruction (``<?name something ?>``) xmlElementStart, ## ``<elem>`` xmlElementEnd, ## ``</elem>`` xmlElementOpen, ## ``<elem `` xmlAttribute, ## ``key = "value"`` pair xmlElementClose, ## ``>`` xmlCData, ## ``<![CDATA[`` ... data ... ``]]>`` xmlEntity, ## &entity; xmlSpecial ## ``<! ... data ... >``
- enumeration of all events that may occur when parsing Source Edit
-
XmlErrorKind = enum errNone, ## no error errEndOfCDataExpected, ## ``]]>`` expected errNameExpected, ## name expected errSemicolonExpected, ## ``;`` expected errQmGtExpected, ## ``?>`` expected errGtExpected, ## ``>`` expected errEqExpected, ## ``=`` expected errQuoteExpected, ## ``"`` or ``'`` expected errEndOfCommentExpected, ## ``-->`` expected errAttributeValueExpected ## non-empty attribute value expected
- enumeration that lists all errors that can occur Source Edit
-
XmlParseOption = enum reportWhitespace, ## report whitespace reportComments, ## report comments allowUnquotedAttribs, ## allow unquoted attribute values (for HTML) allowEmptyAttribs ## allow empty attributes (without explicit value)
- options for the XML parser Source Edit
-
XmlParser = object of BaseLexer a, b, c: string kind: XmlEventKind err: XmlErrorKind state: ParserState cIsEmpty: bool filename: string options: set[XmlParseOption]
- the parser object. Source Edit
Procs
-
proc open(my: var XmlParser; input: Stream; filename: string; options: set[XmlParseOption] = {}) {...}{.raises: [IOError, OSError], tags: [ReadIOEffect].}
-
initializes the parser with an input stream.
Filename
is only used for nice error messages. The parser's behaviour can be controlled by the options
parameter: If options
contains reportWhitespace
a whitespace token is reported as an xmlWhitespace
event. If options
contains reportComments
a comment token is reported as an xmlComment
event. Source Edit -
proc close(my: var XmlParser) {...}{.inline, raises: [Exception, IOError, OSError], tags: [WriteIOEffect].}
-
closes the parser
my
and its associated input stream. Source Edit -
proc kind(my: XmlParser): XmlEventKind {...}{.inline, raises: [], tags: [].}
- returns the current event type for the XML parser Source Edit
-
proc rawData(my: var XmlParser): lent string {...}{.inline, raises: [], tags: [].}
- returns the underlying 'data' string by reference. This is only used for speed hacks. Source Edit
-
proc rawData2(my: var XmlParser): lent string {...}{.inline, raises: [], tags: [].}
- returns the underlying second 'data' string by reference. This is only used for speed hacks. Source Edit
-
proc getColumn(my: XmlParser): int {...}{.inline, raises: [], tags: [].}
- get the current column the parser has arrived at. Source Edit
-
proc getLine(my: XmlParser): int {...}{.inline, raises: [], tags: [].}
- get the current line the parser has arrived at. Source Edit
-
proc getFilename(my: XmlParser): string {...}{.inline, raises: [], tags: [].}
- get the filename of the file that the parser processes. Source Edit
-
proc errorMsg(my: XmlParser): string {...}{.raises: [ValueError], tags: [].}
-
returns a helpful error message for the event
xmlError
Source Edit -
proc errorMsgExpected(my: XmlParser; tag: string): string {...}{. raises: [ValueError], tags: [].}
- returns an error message "<tag> expected" in the same format as the other error messages Source Edit
-
proc errorMsg(my: XmlParser; msg: string): string {...}{.raises: [ValueError], tags: [].}
-
returns an error message with text
msg
in the same format as the other error messages Source Edit -
proc next(my: var XmlParser) {...}{.raises: [IOError, OSError], tags: [ReadIOEffect].}
- retrieves the first/next event. This controls the parser. Source Edit
Templates
-
template charData(my: XmlParser): string
-
returns the character data for the events:
xmlCharData
,xmlWhitespace
,xmlComment
,xmlCData
,xmlSpecial
Raises an assertion in debug mode if my.kind
is not one of those events. In release mode, this will not trigger an error but the value returned will not be valid. Source Edit -
template elementName(my: XmlParser): string
-
returns the element name for the events:
xmlElementStart
,xmlElementEnd
,xmlElementOpen
Raises an assertion in debug mode if my.kind
is not one of those events. In release mode, this will not trigger an error but the value returned will not be valid. Source Edit -
template entityName(my: XmlParser): string
-
returns the entity name for the event:
xmlEntity
Raises an assertion in debug mode if my.kind
is not xmlEntity
. In release mode, this will not trigger an error but the value returned will not be valid. Source Edit -
template attrKey(my: XmlParser): string
-
returns the attribute key for the event
xmlAttribute
Raises an assertion in debug mode if my.kind
is not xmlAttribute
. In release mode, this will not trigger an error but the value returned will not be valid. Source Edit -
template attrValue(my: XmlParser): string
-
returns the attribute value for the event
xmlAttribute
Raises an assertion in debug mode if my.kind
is not xmlAttribute
. In release mode, this will not trigger an error but the value returned will not be valid. Source Edit -
template piName(my: XmlParser): string
-
returns the processing instruction name for the event
xmlPI
Raises an assertion in debug mode if my.kind
is not xmlPI
. In release mode, this will not trigger an error but the value returned will not be valid. Source Edit -
template piRest(my: XmlParser): string
-
returns the rest of the processing instruction for the event
xmlPI
Raises an assertion in debug mode if my.kind
is not xmlPI
. In release mode, this will not trigger an error but the value returned will not be valid. Source Edit
© 2006–2021 Andreas Rumpf
Licensed under the MIT License.
https://nim-lang.org/docs/parsexml.html