From 25555a8f425df601ab069509188d46e583816d83 Mon Sep 17 00:00:00 2001 From: "aelaguiz@gmail.com" Date: Sun, 29 Jun 2014 20:19:31 -0500 Subject: [PATCH 1/8] Added in jinja parsing --- html5lib/constants.py | 8 +++- html5lib/html5parser.py | 41 +++++++++++++++++++ html5lib/tokenizer.py | 91 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 139 insertions(+), 1 deletion(-) diff --git a/html5lib/constants.py b/html5lib/constants.py index e7089846..fb24a32f 100644 --- a/html5lib/constants.py +++ b/html5lib/constants.py @@ -3085,7 +3085,13 @@ "EndTag": 4, "EmptyTag": 5, "Comment": 6, - "ParseError": 7 + "ParseError": 7, + "JinjaStatementStartTag": 8, + "JinjaStatementEndTag": 9, + "JinjaStatementTag": 10, + "JinjaVariableStartTag": 11, + "JinjaVariableEndTag": 12, + "JinjaVariableTag": 13 } tagTokenTypes = frozenset((tokenTypes["StartTag"], tokenTypes["EndTag"], diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index 5b9ce7d7..6c441bfe 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -1,6 +1,8 @@ from __future__ import absolute_import, division, unicode_literals from six import with_metaclass +import logging + import types from . import inputstream @@ -20,6 +22,9 @@ from .constants import adjustForeignAttributes as adjustForeignAttributesMap +log = logging.getLogger(u"html5lib") + + def parse(doc, treebuilder="etree", encoding=None, namespaceHTMLElements=True): """Parse a string or file-like object into a tree""" @@ -161,6 +166,12 @@ def mainLoop(self): CommentToken = tokenTypes["Comment"] DoctypeToken = tokenTypes["Doctype"] ParseErrorToken = tokenTypes["ParseError"] + JinjaStatementStartTag = tokenTypes["JinjaStatementStartTag"] + JinjaStatementEndTag = tokenTypes["JinjaStatementEndTag"] + JinjaStatementTag = tokenTypes["JinjaStatementTag"] + JinjaVariableStartTag = tokenTypes["JinjaVariableStartTag"] + JinjaVariableEndTag = tokenTypes["JinjaVariableEndTag"] + JinjaVariableTag = tokenTypes["JinjaVariableTag"] for token in self.normalizedTokens(): new_token = token @@ -202,6 +213,18 @@ def mainLoop(self): new_token = phase.processComment(new_token) elif type == DoctypeToken: new_token = phase.processDoctype(new_token) + elif type == JinjaStatementStartTag: + new_token = phase.processJinjaStatementStartTag(new_token) + elif type == JinjaStatementEndTag: + new_token = phase.processJinjaStatementEndTag(new_token) + elif type == JinjaStatementTag: + new_token = phase.processJinjaStatementTag(new_token) + elif type == JinjaVariableStartTag: + new_token = phase.processJinjaVariableStartTag(new_token) + elif type == JinjaVariableEndTag: + new_token = phase.processJinjaVariableEndTag(new_token) + elif type == JinjaVariableTag: + new_token = phase.processJinjaVariableTag(new_token) if (type == StartTagToken and token["selfClosing"] and not token["selfClosingAcknowledged"]): @@ -475,6 +498,24 @@ def processCharacters(self, token): def processSpaceCharacters(self, token): self.tree.insertText(token["data"]) + def processJinjaStatementStartTag(self, token): + pass + + def processJinjaStatementEndTag(self, token): + pass + + def processJinjaStatementTag(self, token): + pass + + def processJinjaVariableStartTag(self, token): + pass + + def processJinjaVariableEndTag(self, token): + pass + + def processJinjaVariableTag(self, token): + pass + def processStartTag(self, token): return self.startTagHandler[token["name"]](token) diff --git a/html5lib/tokenizer.py b/html5lib/tokenizer.py index 79774578..6498e7ba 100644 --- a/html5lib/tokenizer.py +++ b/html5lib/tokenizer.py @@ -5,6 +5,10 @@ except NameError: pass + +import sys +import logging + from collections import deque from .constants import spaceCharacters @@ -20,6 +24,8 @@ entitiesTrie = Trie(entities) +log = logging.getLogger(u"html5lib") + class HTMLTokenizer(object): """ This class takes care of tokenizing HTML. @@ -254,6 +260,8 @@ def dataState(self): self.state = self.entityDataState elif data == "<": self.state = self.tagOpenState + elif data == "{": + self.state = self.jinjaOpenState elif data == "\u0000": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"}) @@ -282,6 +290,89 @@ def entityDataState(self): self.state = self.dataState return True + def jinjaOpenState(self): + data = self.stream.char() + + if data == "{": + self.tokenQueue.append({ + "type": tokenTypes["JinjaVariableStartTag"], + "name": "{{", "data": [], + "selfClosing": False + }) + + self.state = self.jinjaVariableState + elif data == "%": + self.tokenQueue.append({ + "type": tokenTypes["JinjaStatementStartTag"], + "name": "{%", "data": [], + "selfClosing": False + }) + + self.state = self.jinjaStatementState + + #self.state = self.dataState + return True + + def jinjaStatementEndState(self): + # We got a { + data = self.stream.char() + + if data == "}": + self.tokenQueue.append({ + "type": tokenTypes["JinjaStatementEndTag"], + "name": "%}", "data": [], + "selfClosing": False + }) + self.state = self.dataState + + #self.state = self.dataState + return True + + def jinjaVariableEndState(self): + # We got a { + data = self.stream.char() + + if data == "}": + self.tokenQueue.append({ + "type": tokenTypes["JinjaVariableEndTag"], + "name": "}}", "data": [], + "selfClosing": False + }) + self.state = self.dataState + + #self.state = self.dataState + return True + + def jinjaStatementState(self): + data = self.stream.char() + + if data == "%": + self.state = self.jinjaStatementEndState + elif data is EOF: + # Tokenization ends. + return False + else: + chars = self.stream.charsUntil(("%", "\u0000")) + self.tokenQueue.append({"type": tokenTypes["JinjaStatementTag"], "data": + data + chars}) + + return True + + def jinjaVariableState(self): + data = self.stream.char() + + if data == "}": + self.state = self.jinjaVariableEndState + elif data is EOF: + # Tokenization ends. + return False + else: + chars = self.stream.charsUntil(("}", "\u0000")) + self.tokenQueue.append({"type": tokenTypes["JinjaVariableTag"], "data": + data + chars}) + + return True + def rcdataState(self): data = self.stream.char() if data == "&": From fdde76416be476af83e10b18ac938dd6d431bfde Mon Sep 17 00:00:00 2001 From: "aelaguiz@gmail.com" Date: Sun, 29 Jun 2014 20:51:38 -0500 Subject: [PATCH 2/8] Syntax errors in jinja --- html5lib/html5parser.py | 9 +++++++++ html5lib/tokenizer.py | 32 ++++++++++++++++++++++++++++---- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index 6c441bfe..dbd7dd5b 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -176,6 +176,7 @@ def mainLoop(self): for token in self.normalizedTokens(): new_token = token while new_token is not None: + log.debug(u"Token {} Phase = {}".format(new_token, self.phase)) currentNode = self.tree.openElements[-1] if self.tree.openElements else None currentNodeNamespace = currentNode.namespace if currentNode else None currentNodeName = currentNode.name if currentNode else None @@ -421,6 +422,7 @@ def resetInsertionMode(self): new_phase = self.phases["inBody"] break + log.debug(u"Changing phase to {}".format(new_phase)) self.phase = new_phase def parseRCDataRawtext(self, token, contentType): @@ -438,6 +440,7 @@ def parseRCDataRawtext(self, token, contentType): self.originalPhase = self.phase + log.debug(u"Changing phase to text") self.phase = self.phases["text"] @@ -825,6 +828,8 @@ def startTagOther(self, token): def endTagHead(self, token): node = self.parser.tree.openElements.pop() assert node.name == "head", "Expected head got %s" % node.name + log = logging.getLogger(u"html5lib") + log.debug(u"Switching phase to afterHead") self.parser.phase = self.parser.phases["afterHead"] def endTagHtmlBodyBr(self, token): @@ -835,6 +840,8 @@ def endTagOther(self, token): self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) def anythingElse(self): + log = logging.getLogger(u"html5lib") + log.debug(u"Implied end head tag") self.endTagHead(impliedTagToken("head")) # XXX If we implement a parser for which scripting is disabled we need to @@ -905,6 +912,8 @@ def endTagOther(self, token): def anythingElse(self): self.tree.insertElement(impliedTagToken("body", "StartTag")) + log = logging.getLogger(u"html5lib") + log.debug(u"Changing phase to body") self.parser.phase = self.parser.phases["inBody"] self.parser.framesetOK = True diff --git a/html5lib/tokenizer.py b/html5lib/tokenizer.py index 6498e7ba..9d458f67 100644 --- a/html5lib/tokenizer.py +++ b/html5lib/tokenizer.py @@ -324,6 +324,17 @@ def jinjaStatementEndState(self): "selfClosing": False }) self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-jinja-statement-closing-tag-but-got-eof", + "datavars": {"data": data}}) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-jinja-statement-closing-tag-but-got-char", + "datavars": {"data": data}}) + self.stream.unget(data) + self.state = self.bogusCommentState #self.state = self.dataState return True @@ -339,6 +350,17 @@ def jinjaVariableEndState(self): "selfClosing": False }) self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-jinja-variable-closing-tag-but-got-eof", + "datavars": {"data": data}}) + self.state = self.dataState + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-jinja-variable-closing-tag-but-got-char", + "datavars": {"data": data}}) + self.stream.unget(data) + self.state = self.bogusCommentState #self.state = self.dataState return True @@ -349,8 +371,9 @@ def jinjaStatementState(self): if data == "%": self.state = self.jinjaStatementEndState elif data is EOF: - # Tokenization ends. - return False + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-jinja-statement"}) + self.state = self.dataState else: chars = self.stream.charsUntil(("%", "\u0000")) self.tokenQueue.append({"type": tokenTypes["JinjaStatementTag"], "data": @@ -364,8 +387,9 @@ def jinjaVariableState(self): if data == "}": self.state = self.jinjaVariableEndState elif data is EOF: - # Tokenization ends. - return False + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "eof-in-jinja-variable"}) + self.state = self.dataState else: chars = self.stream.charsUntil(("}", "\u0000")) self.tokenQueue.append({"type": tokenTypes["JinjaVariableTag"], "data": From 53a0132c900575c45907a4082f3149b358641014 Mon Sep 17 00:00:00 2001 From: "aelaguiz@gmail.com" Date: Sun, 29 Jun 2014 21:04:10 -0500 Subject: [PATCH 3/8] Readme --- README.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.rst b/README.rst index 9e0a0f74..d0b5ecf7 100644 --- a/README.rst +++ b/README.rst @@ -1,3 +1,8 @@ +Cratejoy html5lib +================= + +Cratejoy fork of html5lib adds syntax checking for jinja templates + html5lib ======== From 5246944d9f68e15567682604b1159d23fbe0e44e Mon Sep 17 00:00:00 2001 From: "aelaguiz@gmail.com" Date: Thu, 3 Jul 2014 17:46:02 -0500 Subject: [PATCH 4/8] WIP - Jinja parsing --- html5lib/constants.py | 3 +- html5lib/html5parser.py | 55 ++++++++++++++++++++++++++++++---- html5lib/tokenizer.py | 44 ++++++++++++++++++++++----- html5lib/treebuilders/_base.py | 12 ++++++++ html5lib/treebuilders/etree.py | 5 ++++ 5 files changed, 105 insertions(+), 14 deletions(-) diff --git a/html5lib/constants.py b/html5lib/constants.py index fb24a32f..e9e8fab8 100644 --- a/html5lib/constants.py +++ b/html5lib/constants.py @@ -3091,7 +3091,8 @@ "JinjaStatementTag": 10, "JinjaVariableStartTag": 11, "JinjaVariableEndTag": 12, - "JinjaVariableTag": 13 + "JinjaVariable": 13, + "JinjaFilter": 14 } tagTokenTypes = frozenset((tokenTypes["StartTag"], tokenTypes["EndTag"], diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index dbd7dd5b..40e4dd7e 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -171,7 +171,8 @@ def mainLoop(self): JinjaStatementTag = tokenTypes["JinjaStatementTag"] JinjaVariableStartTag = tokenTypes["JinjaVariableStartTag"] JinjaVariableEndTag = tokenTypes["JinjaVariableEndTag"] - JinjaVariableTag = tokenTypes["JinjaVariableTag"] + JinjaVariable = tokenTypes["JinjaVariable"] + JinjaFilter = tokenTypes["JinjaFilter"] for token in self.normalizedTokens(): new_token = token @@ -187,7 +188,11 @@ def mainLoop(self): self.parseError(new_token["data"], new_token.get("datavars", {})) new_token = None else: - if (len(self.tree.openElements) == 0 or + if type in (JinjaVariableStartTag, JinjaVariableEndTag, JinjaVariable, JinjaFilter): + log.debug(u"Type is a jinja tag") + phase = self.phases["inJinjaVariable"] + elif ( + len(self.tree.openElements) == 0 or currentNodeNamespace == self.tree.defaultNamespace or (self.isMathMLTextIntegrationPoint(currentNode) and ((type == StartTagToken and @@ -224,8 +229,10 @@ def mainLoop(self): new_token = phase.processJinjaVariableStartTag(new_token) elif type == JinjaVariableEndTag: new_token = phase.processJinjaVariableEndTag(new_token) - elif type == JinjaVariableTag: - new_token = phase.processJinjaVariableTag(new_token) + elif type == JinjaVariable: + new_token = phase.processJinjaVariable(new_token) + elif type == JinjaFilter: + new_token = phase.processJinjaFilter(new_token) if (type == StartTagToken and token["selfClosing"] and not token["selfClosingAcknowledged"]): @@ -516,7 +523,10 @@ def processJinjaVariableStartTag(self, token): def processJinjaVariableEndTag(self, token): pass - def processJinjaVariableTag(self, token): + def processJinjaVariable(self, token): + pass + + def processJinjaFilterTag(self, token): pass def processStartTag(self, token): @@ -535,6 +545,40 @@ def startTagHtml(self, token): def processEndTag(self, token): return self.endTagHandler[token["name"]](token) + class InJinjaVariablePhase(Phase): + def processJinjaVariableStartTag(self, token): + log = logging.getLogger('html5lib') + log.debug(u"InJinja: Start Tag") + self.tree.reconstructActiveFormattingElements() + self.tree.insertElement(token) + + def processJinjaVariableEndTag(self, token): + log = logging.getLogger('html5lib') + log.debug(u"InJinja: End Tag {}".format(token["name"])) + for node in self.tree.openElements[::-1]: + log.debug(u"InJinja: Open tag {} token {}".format(node, token)) + if node.name == token["name"]: + self.tree.generateImpliedEndTags(exclude=token["name"]) + log.debug(u"InJinja: Implied end tag {} {}".format(self.tree.openElements[-1].name, token["name"])) + if self.tree.openElements[-1].name != token["name"]: + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + while self.tree.openElements.pop() != node: + pass + break + else: + if node.nameTuple in specialElements: + log.debug(u"Nametuple {} in {}".format(node.nameTuple, specialElements)) + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + break + + def processJinjaVariable(self, token): + element = self.tree.createElementWithoutNamespace(token) + self.tree.openElements[-1].appendChild(element) + + def processJinjaFilter(self, token): + element = self.tree.createElementWithoutNamespace(token) + self.tree.openElements[-1].appendChild(element) + class InitialPhase(Phase): def processSpaceCharacters(self, token): pass @@ -2739,6 +2783,7 @@ def processEndTag(self, token): "inHead": InHeadPhase, # XXX "inHeadNoscript": InHeadNoScriptPhase, "afterHead": AfterHeadPhase, + "inJinjaVariable": InJinjaVariablePhase, "inBody": InBodyPhase, "text": TextPhase, "inTable": InTablePhase, diff --git a/html5lib/tokenizer.py b/html5lib/tokenizer.py index 9d458f67..4670d260 100644 --- a/html5lib/tokenizer.py +++ b/html5lib/tokenizer.py @@ -294,17 +294,21 @@ def jinjaOpenState(self): data = self.stream.char() if data == "{": - self.tokenQueue.append({ + self.currentToken = { "type": tokenTypes["JinjaVariableStartTag"], - "name": "{{", "data": [], + "name": u"jinjavariabletag", "data": {}, + "namespace": None, "selfClosing": False - }) + } + + self.tokenQueue.append(self.currentToken) self.state = self.jinjaVariableState elif data == "%": self.tokenQueue.append({ "type": tokenTypes["JinjaStatementStartTag"], - "name": "{%", "data": [], + "name": "{%", "data": {}, + "namespace": None, "selfClosing": False }) @@ -346,7 +350,7 @@ def jinjaVariableEndState(self): if data == "}": self.tokenQueue.append({ "type": tokenTypes["JinjaVariableEndTag"], - "name": "}}", "data": [], + "name": u"jinjavariabletag", "data": [], "selfClosing": False }) self.state = self.dataState @@ -390,10 +394,34 @@ def jinjaVariableState(self): self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-jinja-variable"}) self.state = self.dataState + elif data in spaceCharacters: + # Skip spaces + pass + elif data == "|": + pass + # If this is the first token after the variable start tag + elif self.currentToken['type'] == tokenTypes["JinjaVariableStartTag"]: + #log.debug(u"Got start tag {}".format(("|", "}", "\u0000") | spaceCharacters)) + + chars = self.stream.charsUntil(frozenset(("|", "}", "\u0000")) | spaceCharacters) + self.currentToken = {"type": tokenTypes["JinjaVariable"], + "name": "jinjavariable", "selfClosing": True, "data": { + "value": data + chars, + "position": self.stream.position(), + }} + self.tokenQueue.append(self.currentToken) else: - chars = self.stream.charsUntil(("}", "\u0000")) - self.tokenQueue.append({"type": tokenTypes["JinjaVariableTag"], "data": - data + chars}) + chars = self.stream.charsUntil(frozenset(("|", "}", "\u0000")) | spaceCharacters) + self.currentToken = {"type": tokenTypes["JinjaFilter"], + "name": "jinjafilter", "selfClosing": True, "data": { + "value": data + chars, + "position": self.stream.position(), + }} + self.tokenQueue.append(self.currentToken) + #else: + #chars = self.stream.charsUntil(("}", "\u0000")) + #self.tokenQueue.append({"type": tokenTypes["JinjaVariableTag"], "data": + #data + chars}) return True diff --git a/html5lib/treebuilders/_base.py b/html5lib/treebuilders/_base.py index 8b97cc11..6e5c2561 100644 --- a/html5lib/treebuilders/_base.py +++ b/html5lib/treebuilders/_base.py @@ -3,6 +3,11 @@ from ..constants import scopingElements, tableInsertModeElements, namespaces +import logging + +log = logging.getLogger("html5lib") + + # The scope markers are inserted when entering object elements, # marquees, table cells, and table captions, and are used to prevent formatting # from "leaking" into tables, object elements, and marquees. @@ -269,6 +274,13 @@ def createElement(self, token): element.attributes = token["data"] return element + def createElementWithoutNamespace(self, token): + """Create an element but don't insert it anywhere""" + name = token["name"] + element = self.elementClass(name) + element.attributes = token["data"] + return element + def _getInsertFromTable(self): return self._insertFromTable diff --git a/html5lib/treebuilders/etree.py b/html5lib/treebuilders/etree.py index 2c8ed19f..03d51275 100644 --- a/html5lib/treebuilders/etree.py +++ b/html5lib/treebuilders/etree.py @@ -3,6 +3,8 @@ import re +import logging + from . import _base from .. import ihatexml from .. import constants @@ -11,6 +13,8 @@ tag_regexp = re.compile("{([^}]*)}(.*)") +log = logging.getLogger("html5lib") + def getETreeBuilder(ElementTreeImplementation, fullTree=False): ElementTree = ElementTreeImplementation @@ -59,6 +63,7 @@ def _getAttributes(self): return self._element.attrib def _setAttributes(self, attributes): + log.debug(u"Attributes {}".format(attributes)) # Delete existing attributes first # XXX - there may be a better way to do this... for key in list(self._element.attrib.keys()): From f756cab03476ddb0e128ab08477e9c3a1e8b94b0 Mon Sep 17 00:00:00 2001 From: "aelaguiz@gmail.com" Date: Fri, 4 Jul 2014 09:06:48 -0500 Subject: [PATCH 5/8] WIP - Jinja parsing --- html5lib/constants.py | 3 ++- html5lib/html5parser.py | 14 ++++++++++++-- html5lib/tokenizer.py | 24 ++++++++++++++++-------- 3 files changed, 30 insertions(+), 11 deletions(-) diff --git a/html5lib/constants.py b/html5lib/constants.py index e9e8fab8..5735d7b6 100644 --- a/html5lib/constants.py +++ b/html5lib/constants.py @@ -3092,7 +3092,8 @@ "JinjaVariableStartTag": 11, "JinjaVariableEndTag": 12, "JinjaVariable": 13, - "JinjaFilter": 14 + "JinjaFilter": 14, + "JinjaPipe": 15 } tagTokenTypes = frozenset((tokenTypes["StartTag"], tokenTypes["EndTag"], diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index 40e4dd7e..9d836e16 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -172,6 +172,7 @@ def mainLoop(self): JinjaVariableStartTag = tokenTypes["JinjaVariableStartTag"] JinjaVariableEndTag = tokenTypes["JinjaVariableEndTag"] JinjaVariable = tokenTypes["JinjaVariable"] + JinjaPipe = tokenTypes["JinjaPipe"] JinjaFilter = tokenTypes["JinjaFilter"] for token in self.normalizedTokens(): @@ -188,7 +189,7 @@ def mainLoop(self): self.parseError(new_token["data"], new_token.get("datavars", {})) new_token = None else: - if type in (JinjaVariableStartTag, JinjaVariableEndTag, JinjaVariable, JinjaFilter): + if type in (JinjaVariableStartTag, JinjaVariableEndTag, JinjaVariable, JinjaFilter, JinjaPipe): log.debug(u"Type is a jinja tag") phase = self.phases["inJinjaVariable"] elif ( @@ -231,6 +232,8 @@ def mainLoop(self): new_token = phase.processJinjaVariableEndTag(new_token) elif type == JinjaVariable: new_token = phase.processJinjaVariable(new_token) + elif type == JinjaPipe: + new_token = phase.processJinjaPipe(new_token) elif type == JinjaFilter: new_token = phase.processJinjaFilter(new_token) @@ -429,7 +432,7 @@ def resetInsertionMode(self): new_phase = self.phases["inBody"] break - log.debug(u"Changing phase to {}".format(new_phase)) + #log.debug(u"Changing phase to {}".format(new_phase)) self.phase = new_phase def parseRCDataRawtext(self, token, contentType): @@ -526,6 +529,9 @@ def processJinjaVariableEndTag(self, token): def processJinjaVariable(self, token): pass + def processJinjaPipe(self, token): + pass + def processJinjaFilterTag(self, token): pass @@ -575,6 +581,10 @@ def processJinjaVariable(self, token): element = self.tree.createElementWithoutNamespace(token) self.tree.openElements[-1].appendChild(element) + def processJinjaPipe(self, token): + element = self.tree.createElementWithoutNamespace(token) + self.tree.openElements[-1].appendChild(element) + def processJinjaFilter(self, token): element = self.tree.createElementWithoutNamespace(token) self.tree.openElements[-1].appendChild(element) diff --git a/html5lib/tokenizer.py b/html5lib/tokenizer.py index 4670d260..09e705ff 100644 --- a/html5lib/tokenizer.py +++ b/html5lib/tokenizer.py @@ -390,6 +390,8 @@ def jinjaVariableState(self): if data == "}": self.state = self.jinjaVariableEndState + #elif data == "(": + #self.state = self.jinjaArgState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "eof-in-jinja-variable"}) @@ -398,30 +400,36 @@ def jinjaVariableState(self): # Skip spaces pass elif data == "|": - pass + self.currentToken = {"type": tokenTypes["JinjaPipe"], + "name": "jinjapipe", "selfClosing": True, "data": { + "value": data, + "position": self.stream.position(), + }} + self.tokenQueue.append(self.currentToken) # If this is the first token after the variable start tag elif self.currentToken['type'] == tokenTypes["JinjaVariableStartTag"]: #log.debug(u"Got start tag {}".format(("|", "}", "\u0000") | spaceCharacters)) - chars = self.stream.charsUntil(frozenset(("|", "}", "\u0000")) | spaceCharacters) + chars = self.stream.charsUntil(frozenset(("(", "|", "}", "\u0000")) | spaceCharacters) self.currentToken = {"type": tokenTypes["JinjaVariable"], "name": "jinjavariable", "selfClosing": True, "data": { "value": data + chars, "position": self.stream.position(), }} self.tokenQueue.append(self.currentToken) - else: - chars = self.stream.charsUntil(frozenset(("|", "}", "\u0000")) | spaceCharacters) + elif self.currentToken['type'] == tokenTypes["JinjaPipe"]: + chars = self.stream.charsUntil(frozenset(("(", "|", "}", "\u0000")) | spaceCharacters) self.currentToken = {"type": tokenTypes["JinjaFilter"], "name": "jinjafilter", "selfClosing": True, "data": { "value": data + chars, "position": self.stream.position(), }} self.tokenQueue.append(self.currentToken) - #else: - #chars = self.stream.charsUntil(("}", "\u0000")) - #self.tokenQueue.append({"type": tokenTypes["JinjaVariableTag"], "data": - #data + chars}) + else: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-jinja-pipe-got-character"}) + self.stream.unget(data) + self.state = self.bogusCommentState return True From 2e6aaf62f183f4891579f1c72dfea665f9fc2d01 Mon Sep 17 00:00:00 2001 From: "aelaguiz@gmail.com" Date: Fri, 4 Jul 2014 09:37:58 -0500 Subject: [PATCH 6/8] Removed unecessary work around jinja, now we just toss stuff to make html parsing work --- html5lib/html5parser.py | 77 +----------------------- html5lib/tokenizer.py | 103 ++++++++++++--------------------- html5lib/treebuilders/etree.py | 1 - 3 files changed, 37 insertions(+), 144 deletions(-) diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index 9d836e16..300b2737 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -166,19 +166,10 @@ def mainLoop(self): CommentToken = tokenTypes["Comment"] DoctypeToken = tokenTypes["Doctype"] ParseErrorToken = tokenTypes["ParseError"] - JinjaStatementStartTag = tokenTypes["JinjaStatementStartTag"] - JinjaStatementEndTag = tokenTypes["JinjaStatementEndTag"] - JinjaStatementTag = tokenTypes["JinjaStatementTag"] - JinjaVariableStartTag = tokenTypes["JinjaVariableStartTag"] - JinjaVariableEndTag = tokenTypes["JinjaVariableEndTag"] - JinjaVariable = tokenTypes["JinjaVariable"] - JinjaPipe = tokenTypes["JinjaPipe"] - JinjaFilter = tokenTypes["JinjaFilter"] for token in self.normalizedTokens(): new_token = token while new_token is not None: - log.debug(u"Token {} Phase = {}".format(new_token, self.phase)) currentNode = self.tree.openElements[-1] if self.tree.openElements else None currentNodeNamespace = currentNode.namespace if currentNode else None currentNodeName = currentNode.name if currentNode else None @@ -189,10 +180,7 @@ def mainLoop(self): self.parseError(new_token["data"], new_token.get("datavars", {})) new_token = None else: - if type in (JinjaVariableStartTag, JinjaVariableEndTag, JinjaVariable, JinjaFilter, JinjaPipe): - log.debug(u"Type is a jinja tag") - phase = self.phases["inJinjaVariable"] - elif ( + if ( len(self.tree.openElements) == 0 or currentNodeNamespace == self.tree.defaultNamespace or (self.isMathMLTextIntegrationPoint(currentNode) and @@ -220,22 +208,6 @@ def mainLoop(self): new_token = phase.processComment(new_token) elif type == DoctypeToken: new_token = phase.processDoctype(new_token) - elif type == JinjaStatementStartTag: - new_token = phase.processJinjaStatementStartTag(new_token) - elif type == JinjaStatementEndTag: - new_token = phase.processJinjaStatementEndTag(new_token) - elif type == JinjaStatementTag: - new_token = phase.processJinjaStatementTag(new_token) - elif type == JinjaVariableStartTag: - new_token = phase.processJinjaVariableStartTag(new_token) - elif type == JinjaVariableEndTag: - new_token = phase.processJinjaVariableEndTag(new_token) - elif type == JinjaVariable: - new_token = phase.processJinjaVariable(new_token) - elif type == JinjaPipe: - new_token = phase.processJinjaPipe(new_token) - elif type == JinjaFilter: - new_token = phase.processJinjaFilter(new_token) if (type == StartTagToken and token["selfClosing"] and not token["selfClosingAcknowledged"]): @@ -432,7 +404,6 @@ def resetInsertionMode(self): new_phase = self.phases["inBody"] break - #log.debug(u"Changing phase to {}".format(new_phase)) self.phase = new_phase def parseRCDataRawtext(self, token, contentType): @@ -450,7 +421,6 @@ def parseRCDataRawtext(self, token, contentType): self.originalPhase = self.phase - log.debug(u"Changing phase to text") self.phase = self.phases["text"] @@ -551,44 +521,6 @@ def startTagHtml(self, token): def processEndTag(self, token): return self.endTagHandler[token["name"]](token) - class InJinjaVariablePhase(Phase): - def processJinjaVariableStartTag(self, token): - log = logging.getLogger('html5lib') - log.debug(u"InJinja: Start Tag") - self.tree.reconstructActiveFormattingElements() - self.tree.insertElement(token) - - def processJinjaVariableEndTag(self, token): - log = logging.getLogger('html5lib') - log.debug(u"InJinja: End Tag {}".format(token["name"])) - for node in self.tree.openElements[::-1]: - log.debug(u"InJinja: Open tag {} token {}".format(node, token)) - if node.name == token["name"]: - self.tree.generateImpliedEndTags(exclude=token["name"]) - log.debug(u"InJinja: Implied end tag {} {}".format(self.tree.openElements[-1].name, token["name"])) - if self.tree.openElements[-1].name != token["name"]: - self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) - while self.tree.openElements.pop() != node: - pass - break - else: - if node.nameTuple in specialElements: - log.debug(u"Nametuple {} in {}".format(node.nameTuple, specialElements)) - self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) - break - - def processJinjaVariable(self, token): - element = self.tree.createElementWithoutNamespace(token) - self.tree.openElements[-1].appendChild(element) - - def processJinjaPipe(self, token): - element = self.tree.createElementWithoutNamespace(token) - self.tree.openElements[-1].appendChild(element) - - def processJinjaFilter(self, token): - element = self.tree.createElementWithoutNamespace(token) - self.tree.openElements[-1].appendChild(element) - class InitialPhase(Phase): def processSpaceCharacters(self, token): pass @@ -882,8 +814,6 @@ def startTagOther(self, token): def endTagHead(self, token): node = self.parser.tree.openElements.pop() assert node.name == "head", "Expected head got %s" % node.name - log = logging.getLogger(u"html5lib") - log.debug(u"Switching phase to afterHead") self.parser.phase = self.parser.phases["afterHead"] def endTagHtmlBodyBr(self, token): @@ -894,8 +824,6 @@ def endTagOther(self, token): self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) def anythingElse(self): - log = logging.getLogger(u"html5lib") - log.debug(u"Implied end head tag") self.endTagHead(impliedTagToken("head")) # XXX If we implement a parser for which scripting is disabled we need to @@ -966,8 +894,6 @@ def endTagOther(self, token): def anythingElse(self): self.tree.insertElement(impliedTagToken("body", "StartTag")) - log = logging.getLogger(u"html5lib") - log.debug(u"Changing phase to body") self.parser.phase = self.parser.phases["inBody"] self.parser.framesetOK = True @@ -2793,7 +2719,6 @@ def processEndTag(self, token): "inHead": InHeadPhase, # XXX "inHeadNoscript": InHeadNoScriptPhase, "afterHead": AfterHeadPhase, - "inJinjaVariable": InJinjaVariablePhase, "inBody": InBodyPhase, "text": TextPhase, "inTable": InTablePhase, diff --git a/html5lib/tokenizer.py b/html5lib/tokenizer.py index 09e705ff..425c4d92 100644 --- a/html5lib/tokenizer.py +++ b/html5lib/tokenizer.py @@ -294,23 +294,21 @@ def jinjaOpenState(self): data = self.stream.char() if data == "{": - self.currentToken = { - "type": tokenTypes["JinjaVariableStartTag"], - "name": u"jinjavariabletag", "data": {}, - "namespace": None, - "selfClosing": False - } + #self.currentToken = { + #"type": tokenTypes["JinjaVariableStartTag"], + #"name": "{{", "data": {}, + #"selfClosing": False + #} - self.tokenQueue.append(self.currentToken) + #self.tokenQueue.append(self.currentToken) self.state = self.jinjaVariableState elif data == "%": - self.tokenQueue.append({ - "type": tokenTypes["JinjaStatementStartTag"], - "name": "{%", "data": {}, - "namespace": None, - "selfClosing": False - }) + #self.tokenQueue.append({ + #"type": tokenTypes["JinjaStatementStartTag"], + #"name": "{%", "data": {}, + #"selfClosing": False + #}) self.state = self.jinjaStatementState @@ -322,11 +320,11 @@ def jinjaStatementEndState(self): data = self.stream.char() if data == "}": - self.tokenQueue.append({ - "type": tokenTypes["JinjaStatementEndTag"], - "name": "%}", "data": [], - "selfClosing": False - }) + #self.tokenQueue.append({ + #"type": tokenTypes["JinjaStatementEndTag"], + #"name": "%}", "data": [], + #"selfClosing": False + #}) self.state = self.dataState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": @@ -334,11 +332,7 @@ def jinjaStatementEndState(self): "datavars": {"data": data}}) self.state = self.dataState else: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "expected-jinja-statement-closing-tag-but-got-char", - "datavars": {"data": data}}) - self.stream.unget(data) - self.state = self.bogusCommentState + self.state = self.jinjaStatementState #self.state = self.dataState return True @@ -348,11 +342,11 @@ def jinjaVariableEndState(self): data = self.stream.char() if data == "}": - self.tokenQueue.append({ - "type": tokenTypes["JinjaVariableEndTag"], - "name": u"jinjavariabletag", "data": [], - "selfClosing": False - }) + #self.tokenQueue.append({ + #"type": tokenTypes["JinjaVariableEndTag"], + #"name": "}}", "data": [], + #"selfClosing": False + #}) self.state = self.dataState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": @@ -360,11 +354,7 @@ def jinjaVariableEndState(self): "datavars": {"data": data}}) self.state = self.dataState else: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "expected-jinja-variable-closing-tag-but-got-char", - "datavars": {"data": data}}) - self.stream.unget(data) - self.state = self.bogusCommentState + self.state = self.jinjaStatementState #self.state = self.dataState return True @@ -376,12 +366,12 @@ def jinjaStatementState(self): self.state = self.jinjaStatementEndState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "eof-in-jinja-statement"}) + "missing-jinja-closing-brace"}) self.state = self.dataState else: chars = self.stream.charsUntil(("%", "\u0000")) - self.tokenQueue.append({"type": tokenTypes["JinjaStatementTag"], "data": - data + chars}) + #self.tokenQueue.append({"type": tokenTypes["JinjaStatementTag"], "data": + #data + chars}) return True @@ -390,46 +380,25 @@ def jinjaVariableState(self): if data == "}": self.state = self.jinjaVariableEndState - #elif data == "(": - #self.state = self.jinjaArgState elif data is EOF: self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "eof-in-jinja-variable"}) + "missing-jinja-closing-brace"}) self.state = self.dataState elif data in spaceCharacters: # Skip spaces pass elif data == "|": - self.currentToken = {"type": tokenTypes["JinjaPipe"], - "name": "jinjapipe", "selfClosing": True, "data": { - "value": data, - "position": self.stream.position(), - }} - self.tokenQueue.append(self.currentToken) + pass # If this is the first token after the variable start tag - elif self.currentToken['type'] == tokenTypes["JinjaVariableStartTag"]: - #log.debug(u"Got start tag {}".format(("|", "}", "\u0000") | spaceCharacters)) - - chars = self.stream.charsUntil(frozenset(("(", "|", "}", "\u0000")) | spaceCharacters) - self.currentToken = {"type": tokenTypes["JinjaVariable"], - "name": "jinjavariable", "selfClosing": True, "data": { - "value": data + chars, - "position": self.stream.position(), - }} - self.tokenQueue.append(self.currentToken) - elif self.currentToken['type'] == tokenTypes["JinjaPipe"]: - chars = self.stream.charsUntil(frozenset(("(", "|", "}", "\u0000")) | spaceCharacters) - self.currentToken = {"type": tokenTypes["JinjaFilter"], - "name": "jinjafilter", "selfClosing": True, "data": { - "value": data + chars, - "position": self.stream.position(), - }} - self.tokenQueue.append(self.currentToken) else: - self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": - "expected-jinja-pipe-got-character"}) - self.stream.unget(data) - self.state = self.bogusCommentState + chars = self.stream.charsUntil(frozenset(("}", "\u0000")) | spaceCharacters) + #self.currentToken = {"type": tokenTypes["JinjaFilterTag"], "data": + #data + chars} + #self.tokenQueue.append(self.currentToken) + #else: + #chars = self.stream.charsUntil(("}", "\u0000")) + #self.tokenQueue.append({"type": tokenTypes["JinjaVariableTag"], "data": + #data + chars}) return True diff --git a/html5lib/treebuilders/etree.py b/html5lib/treebuilders/etree.py index 03d51275..5d68fcd8 100644 --- a/html5lib/treebuilders/etree.py +++ b/html5lib/treebuilders/etree.py @@ -63,7 +63,6 @@ def _getAttributes(self): return self._element.attrib def _setAttributes(self, attributes): - log.debug(u"Attributes {}".format(attributes)) # Delete existing attributes first # XXX - there may be a better way to do this... for key in list(self._element.attrib.keys()): From 913aa1f2f544bb497010c1fbfc6394f275b94ce5 Mon Sep 17 00:00:00 2001 From: "aelaguiz@gmail.com" Date: Sun, 28 Dec 2014 19:51:41 -0600 Subject: [PATCH 7/8] WIP --- html5lib/html5parser.py | 1 + html5lib/tests/test_jinja.py | 57 ++++++++++++++++++++++++++++++++++++ html5lib/tokenizer.py | 48 ++++++++++++++++++++++++++++++ 3 files changed, 106 insertions(+) create mode 100644 html5lib/tests/test_jinja.py diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index 300b2737..91a5ae7b 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -168,6 +168,7 @@ def mainLoop(self): ParseErrorToken = tokenTypes["ParseError"] for token in self.normalizedTokens(): + #log.debug(u"Token {}".format(token)) new_token = token while new_token is not None: currentNode = self.tree.openElements[-1] if self.tree.openElements else None diff --git a/html5lib/tests/test_jinja.py b/html5lib/tests/test_jinja.py new file mode 100644 index 00000000..63dad144 --- /dev/null +++ b/html5lib/tests/test_jinja.py @@ -0,0 +1,57 @@ +import html5lib +import unittest +import logging + +log = logging.getLogger(__name__) + + +def dump(tree, tabs=0): + log.debug(u"{}Tag '{}' - {} children - Value = {} - Text = {}".format( + "".join(["\t" for i in range(tabs)]), tree.tag, len(tree), tree.attrib['value'] if 'value' in tree.attrib else None, tree.text)) + + for child in tree: + dump(child, tabs + 1) + + +class JinjaTestCase(unittest.TestCase): + def setUp(self): + self.parser = html5lib.HTMLParser(strict=True, namespaceHTMLElements=False, tree=html5lib.treebuilders.getTreeBuilder("etree", fullTree=True)) + + def assertTree(self, root, spec): + self.assertEqual(len(root), len(spec)) + + for child, spec_child in zip(root, spec): + self.assertEqual(child.tag, spec_child['tag']) + + if 'text' in spec_child: + self.assertEqual(child.text, spec_child['text']) + + if 'value' in spec_child: + self.assertEqual(child.attrib['value'], spec_child['value']) + + if 'children' in spec_child: + self.assertTree(child, spec_child['children']) + else: + self.assertEqual(len(child), 0) + + if 'attrs' in spec_child: + for k, v in spec_child['attrs'].iteritems(): + self.assertIn(k, child.attrib) + self.assertEqual(v, child.attrib[k]) + + def test_open_block(self): + html_string = """ + + """ + tree = self.parser.parseFragment(html_string) + dump(tree) diff --git a/html5lib/tokenizer.py b/html5lib/tokenizer.py index 425c4d92..35a5f718 100644 --- a/html5lib/tokenizer.py +++ b/html5lib/tokenizer.py @@ -311,6 +311,14 @@ def jinjaOpenState(self): #}) self.state = self.jinjaStatementState + elif data == "#": + #self.tokenQueue.append({ + #"type": tokenTypes["JinjaStatementStartTag"], + #"name": "{%", "data": {}, + #"selfClosing": False + #}) + + self.state = self.jinjaCommentState #self.state = self.dataState return True @@ -359,6 +367,28 @@ def jinjaVariableEndState(self): #self.state = self.dataState return True + def jinjaCommentEndState(self): + # We got a { + data = self.stream.char() + + if data == "}": + #self.tokenQueue.append({ + #"type": tokenTypes["JinjaVariableEndTag"], + #"name": "}}", "data": [], + #"selfClosing": False + #}) + self.state = self.dataState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "expected-jinja-comment-closing-tag-but-got-eof", + "datavars": {"data": data}}) + self.state = self.dataState + else: + self.state = self.jinjaStatementState + + #self.state = self.dataState + return True + def jinjaStatementState(self): data = self.stream.char() @@ -375,6 +405,24 @@ def jinjaStatementState(self): return True + def jinjaCommentState(self): + data = self.stream.char() + + log.debug(u"Jinja comment state '{}'".format(data)) + + if data == "#": + self.state = self.jinjaCommentEndState + elif data is EOF: + self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": + "missing-jinja-comment-closing-brace"}) + self.state = self.dataState + else: + chars = self.stream.charsUntil(("#", "\u0000")) + #self.tokenQueue.append({"type": tokenTypes["JinjaStatementTag"], "data": + #data + chars}) + + return True + def jinjaVariableState(self): data = self.stream.char() From 92134ee97e3d1776a7f08f88063c6f6ce1e3a7d4 Mon Sep 17 00:00:00 2001 From: "aelaguiz@gmail.com" Date: Sun, 28 Dec 2014 20:04:12 -0600 Subject: [PATCH 8/8] WIP --- html5lib/tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html5lib/tokenizer.py b/html5lib/tokenizer.py index 35a5f718..b1267cd5 100644 --- a/html5lib/tokenizer.py +++ b/html5lib/tokenizer.py @@ -408,7 +408,7 @@ def jinjaStatementState(self): def jinjaCommentState(self): data = self.stream.char() - log.debug(u"Jinja comment state '{}'".format(data)) + #log.debug(u"Jinja comment state '{}'".format(data)) if data == "#": self.state = self.jinjaCommentEndState pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy