Source code for html5lib.treewalkers.base

from __future__ import absolute_import, division, unicode_literals

from xml.dom import Node
from ..constants import namespaces, voidElements, spaceCharacters

__all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN",
           "TreeWalker", "NonRecursiveTreeWalker"]

DOCUMENT = Node.DOCUMENT_NODE
DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE
ELEMENT = Node.ELEMENT_NODE
COMMENT = Node.COMMENT_NODE
ENTITY = Node.ENTITY_NODE
UNKNOWN = "<#UNKNOWN#>"

spaceCharacters = "".join(spaceCharacters)


[docs]class TreeWalker(object): """Walks a tree yielding tokens Tokens are dicts that all have a ``type`` field specifying the type of the token. """
[docs] def __init__(self, tree): """Creates a TreeWalker :arg tree: the tree to walk """ self.tree = tree
def __iter__(self): raise NotImplementedError
[docs] def error(self, msg): """Generates an error token with the given message :arg msg: the error message :returns: SerializeError token """ return {"type": "SerializeError", "data": msg}
[docs] def emptyTag(self, namespace, name, attrs, hasChildren=False): """Generates an EmptyTag token :arg namespace: the namespace of the token--can be ``None`` :arg name: the name of the element :arg attrs: the attributes of the element as a dict :arg hasChildren: whether or not to yield a SerializationError because this tag shouldn't have children :returns: EmptyTag token """ yield {"type": "EmptyTag", "name": name, "namespace": namespace, "data": attrs} if hasChildren: yield self.error("Void element has children")
[docs] def startTag(self, namespace, name, attrs): """Generates a StartTag token :arg namespace: the namespace of the token--can be ``None`` :arg name: the name of the element :arg attrs: the attributes of the element as a dict :returns: StartTag token """ return {"type": "StartTag", "name": name, "namespace": namespace, "data": attrs}
[docs] def endTag(self, namespace, name): """Generates an EndTag token :arg namespace: the namespace of the token--can be ``None`` :arg name: the name of the element :returns: EndTag token """ return {"type": "EndTag", "name": name, "namespace": namespace}
[docs] def text(self, data): """Generates SpaceCharacters and Characters tokens Depending on what's in the data, this generates one or more ``SpaceCharacters`` and ``Characters`` tokens. For example: >>> from html5lib.treewalkers.base import TreeWalker >>> # Give it an empty tree just so it instantiates >>> walker = TreeWalker([]) >>> list(walker.text('')) [] >>> list(walker.text(' ')) [{u'data': ' ', u'type': u'SpaceCharacters'}] >>> list(walker.text(' abc ')) # doctest: +NORMALIZE_WHITESPACE [{u'data': ' ', u'type': u'SpaceCharacters'}, {u'data': u'abc', u'type': u'Characters'}, {u'data': u' ', u'type': u'SpaceCharacters'}] :arg data: the text data :returns: one or more ``SpaceCharacters`` and ``Characters`` tokens """ data = data middle = data.lstrip(spaceCharacters) left = data[:len(data) - len(middle)] if left: yield {"type": "SpaceCharacters", "data": left} data = middle middle = data.rstrip(spaceCharacters) right = data[len(middle):] if middle: yield {"type": "Characters", "data": middle} if right: yield {"type": "SpaceCharacters", "data": right}
[docs] def comment(self, data): """Generates a Comment token :arg data: the comment :returns: Comment token """ return {"type": "Comment", "data": data}
[docs] def doctype(self, name, publicId=None, systemId=None): """Generates a Doctype token :arg name: :arg publicId: :arg systemId: :returns: the Doctype token """ return {"type": "Doctype", "name": name, "publicId": publicId, "systemId": systemId}
[docs] def entity(self, name): """Generates an Entity token :arg name: the entity name :returns: an Entity token """ return {"type": "Entity", "name": name}
[docs] def unknown(self, nodeType): """Handles unknown node types""" return self.error("Unknown node type: " + nodeType)
[docs]class NonRecursiveTreeWalker(TreeWalker): def getNodeDetails(self, node): raise NotImplementedError def getFirstChild(self, node): raise NotImplementedError def getNextSibling(self, node): raise NotImplementedError def getParentNode(self, node): raise NotImplementedError def __iter__(self): currentNode = self.tree while currentNode is not None: details = self.getNodeDetails(currentNode) type, details = details[0], details[1:] hasChildren = False if type == DOCTYPE: yield self.doctype(*details) elif type == TEXT: for token in self.text(*details): yield token elif type == ELEMENT: namespace, name, attributes, hasChildren = details if (not namespace or namespace == namespaces["html"]) and name in voidElements: for token in self.emptyTag(namespace, name, attributes, hasChildren): yield token hasChildren = False else: yield self.startTag(namespace, name, attributes) elif type == COMMENT: yield self.comment(details[0]) elif type == ENTITY: yield self.entity(details[0]) elif type == DOCUMENT: hasChildren = True else: yield self.unknown(details[0]) if hasChildren: firstChild = self.getFirstChild(currentNode) else: firstChild = None if firstChild is not None: currentNode = firstChild else: while currentNode is not None: details = self.getNodeDetails(currentNode) type, details = details[0], details[1:] if type == ELEMENT: namespace, name, attributes, hasChildren = details if (namespace and namespace != namespaces["html"]) or name not in voidElements: yield self.endTag(namespace, name) if self.tree is currentNode: currentNode = None break nextSibling = self.getNextSibling(currentNode) if nextSibling is not None: currentNode = nextSibling break else: currentNode = self.getParentNode(currentNode)