diff --git a/README.rst b/README.rst index 9e0a0f74..d0b5ecf7 100644 --- a/README.rst +++ b/README.rst @@ -1,3 +1,8 @@ +Cratejoy html5lib +================= + +Cratejoy fork of html5lib adds syntax checking for jinja templates + html5lib ======== diff --git a/html5lib/constants.py b/html5lib/constants.py index e7089846..5735d7b6 100644 --- a/html5lib/constants.py +++ b/html5lib/constants.py @@ -3085,7 +3085,15 @@ "EndTag": 4, "EmptyTag": 5, "Comment": 6, - "ParseError": 7 + "ParseError": 7, + "JinjaStatementStartTag": 8, + "JinjaStatementEndTag": 9, + "JinjaStatementTag": 10, + "JinjaVariableStartTag": 11, + "JinjaVariableEndTag": 12, + "JinjaVariable": 13, + "JinjaFilter": 14, + "JinjaPipe": 15 } tagTokenTypes = frozenset((tokenTypes["StartTag"], tokenTypes["EndTag"], diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index 5b9ce7d7..91a5ae7b 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -1,6 +1,8 @@ from __future__ import absolute_import, division, unicode_literals from six import with_metaclass +import logging + import types from . import inputstream @@ -20,6 +22,9 @@ from .constants import adjustForeignAttributes as adjustForeignAttributesMap +log = logging.getLogger(u"html5lib") + + def parse(doc, treebuilder="etree", encoding=None, namespaceHTMLElements=True): """Parse a string or file-like object into a tree""" @@ -163,6 +168,7 @@ def mainLoop(self): ParseErrorToken = tokenTypes["ParseError"] for token in self.normalizedTokens(): + #log.debug(u"Token {}".format(token)) new_token = token while new_token is not None: currentNode = self.tree.openElements[-1] if self.tree.openElements else None @@ -175,7 +181,8 @@ def mainLoop(self): self.parseError(new_token["data"], new_token.get("datavars", {})) new_token = None else: - if (len(self.tree.openElements) == 0 or + if ( + len(self.tree.openElements) == 0 or currentNodeNamespace == self.tree.defaultNamespace or (self.isMathMLTextIntegrationPoint(currentNode) and ((type == StartTagToken and @@ -475,6 +482,30 @@ def processCharacters(self, token): def processSpaceCharacters(self, token): self.tree.insertText(token["data"]) + def processJinjaStatementStartTag(self, token): + pass + + def processJinjaStatementEndTag(self, token): + pass + + def processJinjaStatementTag(self, token): + pass + + def processJinjaVariableStartTag(self, token): + pass + + def processJinjaVariableEndTag(self, token): + pass + + def processJinjaVariable(self, token): + pass + + def processJinjaPipe(self, token): + pass + + def processJinjaFilterTag(self, token): + pass + def processStartTag(self, token): return self.startTagHandler[token["name"]](token) diff --git a/html5lib/tests/test_jinja.py b/html5lib/tests/test_jinja.py new file mode 100644 index 00000000..63dad144 --- /dev/null +++ b/html5lib/tests/test_jinja.py @@ -0,0 +1,57 @@ +import html5lib +import unittest +import logging + +log = logging.getLogger(__name__) + + +def dump(tree, tabs=0): + log.debug(u"{}Tag '{}' - {} children - Value = {} - Text = {}".format( + "".join(["\t" for i in range(tabs)]), tree.tag, len(tree), tree.attrib['value'] if 'value' in tree.attrib else None, tree.text)) + + for child in tree: + dump(child, tabs + 1) + + +class JinjaTestCase(unittest.TestCase): + def setUp(self): + self.parser = html5lib.HTMLParser(strict=True, namespaceHTMLElements=False, tree=html5lib.treebuilders.getTreeBuilder("etree", fullTree=True)) + + def assertTree(self, root, spec): + self.assertEqual(len(root), len(spec)) + + for child, spec_child in zip(root, spec): + self.assertEqual(child.tag, spec_child['tag']) + + if 'text' in spec_child: + self.assertEqual(child.text, spec_child['text']) + + if 'value' in spec_child: + self.assertEqual(child.attrib['value'], spec_child['value']) + + if 'children' in spec_child: + self.assertTree(child, spec_child['children']) + else: + self.assertEqual(len(child), 0) + + if 'attrs' in spec_child: + for k, v in spec_child['attrs'].iteritems(): + self.assertIn(k, child.attrib) + self.assertEqual(v, child.attrib[k]) + + def test_open_block(self): + html_string = """ +
+ """ + tree = self.parser.parseFragment(html_string) + dump(tree) diff --git a/html5lib/tokenizer.py b/html5lib/tokenizer.py index 79774578..b1267cd5 100644 --- a/html5lib/tokenizer.py +++ b/html5lib/tokenizer.py @@ -5,6 +5,10 @@ except NameError: pass + +import sys +import logging + from collections import deque from .constants import spaceCharacters @@ -20,6 +24,8 @@ entitiesTrie = Trie(entities) +log = logging.getLogger(u"html5lib") + class HTMLTokenizer(object): """ This class takes care of tokenizing HTML. @@ -254,6 +260,8 @@ def dataState(self): self.state = self.entityDataState elif data == "<": self.state = self.tagOpenState + elif data == "{": + self.state = self.jinjaOpenState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) @@ -282,6 +290,166 @@ def entityDataState(self): self.state = self.dataState return True + def jinjaOpenState(self): + data = self.stream.char() + + if data == "{": + #self.currentToken = { + #"type": tokenTypes["JinjaVariableStartTag"], + #"name": "{{", "data": {}, + #"selfClosing": False + #} + + #self.tokenQueue.append(self.currentToken) + + self.state = self.jinjaVariableState + elif data == "%": + #self.tokenQueue.append({ + #"type": tokenTypes["JinjaStatementStartTag"], + #"name": "{%", "data": {}, + #"selfClosing": False + #}) + + self.state = self.jinjaStatementState + elif data == "#": + #self.tokenQueue.append({ + #"type": tokenTypes["JinjaStatementStartTag"], + #"name": "{%", "data": {}, + #"selfClosing": False + #}) + + self.state = self.jinjaCommentState + + #self.state = self.dataState + return True + + def jinjaStatementEndState(self): + # We got a { + data = self.stream.char() + + if data == "}": + #self.tokenQueue.append({ + #"type": tokenTypes["JinjaStatementEndTag"], + #"name": "%}", "data": [], + #"selfClosing": False + #}) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-jinja-statement-closing-tag-but-got-eof", + "datavars": {"data": data}}) + self.state = self.dataState + else: + self.state = self.jinjaStatementState + + #self.state = self.dataState + return True + + def jinjaVariableEndState(self): + # We got a { + data = self.stream.char() + + if data == "}": + #self.tokenQueue.append({ + #"type": tokenTypes["JinjaVariableEndTag"], + #"name": "}}", "data": [], + #"selfClosing": False + #}) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-jinja-variable-closing-tag-but-got-eof", + "datavars": {"data": data}}) + self.state = self.dataState + else: + self.state = self.jinjaStatementState + + #self.state = self.dataState + return True + + def jinjaCommentEndState(self): + # We got a { + data = self.stream.char() + + if data == "}": + #self.tokenQueue.append({ + #"type": tokenTypes["JinjaVariableEndTag"], + #"name": "}}", "data": [], + #"selfClosing": False + #}) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-jinja-comment-closing-tag-but-got-eof", + "datavars": {"data": data}}) + self.state = self.dataState + else: + self.state = self.jinjaStatementState + + #self.state = self.dataState + return True + + def jinjaStatementState(self): + data = self.stream.char() + + if data == "%": + self.state = self.jinjaStatementEndState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "missing-jinja-closing-brace"}) + self.state = self.dataState + else: + chars = self.stream.charsUntil(("%", "\u0000")) + #self.tokenQueue.append({"type": tokenTypes["JinjaStatementTag"], "data": + #data + chars}) + + return True + + def jinjaCommentState(self): + data = self.stream.char() + + #log.debug(u"Jinja comment state '{}'".format(data)) + + if data == "#": + self.state = self.jinjaCommentEndState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "missing-jinja-comment-closing-brace"}) + self.state = self.dataState + else: + chars = self.stream.charsUntil(("#", "\u0000")) + #self.tokenQueue.append({"type": tokenTypes["JinjaStatementTag"], "data": + #data + chars}) + + return True + + def jinjaVariableState(self): + data = self.stream.char() + + if data == "}": + self.state = self.jinjaVariableEndState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "missing-jinja-closing-brace"}) + self.state = self.dataState + elif data in spaceCharacters: + # Skip spaces + pass + elif data == "|": + pass + # If this is the first token after the variable start tag + else: + chars = self.stream.charsUntil(frozenset(("}", "\u0000")) | spaceCharacters) + #self.currentToken = {"type": tokenTypes["JinjaFilterTag"], "data": + #data + chars} + #self.tokenQueue.append(self.currentToken) + #else: + #chars = self.stream.charsUntil(("}", "\u0000")) + #self.tokenQueue.append({"type": tokenTypes["JinjaVariableTag"], "data": + #data + chars}) + + return True + def rcdataState(self): data = self.stream.char() if data == "&": diff --git a/html5lib/treebuilders/_base.py b/html5lib/treebuilders/_base.py index 8b97cc11..6e5c2561 100644 --- a/html5lib/treebuilders/_base.py +++ b/html5lib/treebuilders/_base.py @@ -3,6 +3,11 @@ from ..constants import scopingElements, tableInsertModeElements, namespaces +import logging + +log = logging.getLogger("html5lib") + + # The scope markers are inserted when entering object elements, # marquees, table cells, and table captions, and are used to prevent formatting # from "leaking" into tables, object elements, and marquees. @@ -269,6 +274,13 @@ def createElement(self, token): element.attributes = token["data"] return element + def createElementWithoutNamespace(self, token): + """Create an element but don't insert it anywhere""" + name = token["name"] + element = self.elementClass(name) + element.attributes = token["data"] + return element + def _getInsertFromTable(self): return self._insertFromTable diff --git a/html5lib/treebuilders/etree.py b/html5lib/treebuilders/etree.py index 2c8ed19f..5d68fcd8 100644 --- a/html5lib/treebuilders/etree.py +++ b/html5lib/treebuilders/etree.py @@ -3,6 +3,8 @@ import re +import logging + from . import _base from .. import ihatexml from .. import constants @@ -11,6 +13,8 @@ tag_regexp = re.compile("{([^}]*)}(.*)") +log = logging.getLogger("html5lib") + def getETreeBuilder(ElementTreeImplementation, fullTree=False): ElementTree = ElementTreeImplementationNote: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.
Alternative Proxies: