Skip to content

Commit bd57c61

Browse files
committed
Bring the Python tokenizer up to date with the version of the spec as of the end of September.
1 parent 7d29315 commit bd57c61

File tree

1 file changed

+75
-5
lines changed

1 file changed

+75
-5
lines changed

src/html5lib/tokenizer.py

Lines changed: 75 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -955,7 +955,7 @@ def afterDoctypeNameState(self):
955955
matched = False
956956
break
957957
if matched:
958-
self.state = self.beforeDoctypePublicIdentifierState
958+
self.state = self.afterDoctypePublicKeywordState
959959
return True
960960
elif data in (u"s", u"S"):
961961
matched = True
@@ -966,7 +966,7 @@ def afterDoctypeNameState(self):
966966
matched = False
967967
break
968968
if matched:
969-
self.state = self.beforeDoctypeSystemIdentifierState
969+
self.state = self.afterDoctypeSystemKeywordState
970970
return True
971971

972972
# All the characters read before the current 'data' will be
@@ -981,6 +981,26 @@ def afterDoctypeNameState(self):
981981
self.state = self.bogusDoctypeState
982982

983983
return True
984+
985+
def afterDoctypePublicKeywordState(self):
    """Tokenizer state entered right after a DOCTYPE's PUBLIC keyword.

    Reads one character from the stream and dispatches on it:

    * whitespace: switch to beforeDoctypePublicIdentifierState;
    * a single or double quote: queue an "unexpected-char-in-doctype"
      parse error, push the quote back and switch to
      beforeDoctypePublicIdentifierState;
    * EOF: queue an "eof-in-doctype" parse error, mark the current token
      as not correct, queue it and switch to dataState;
    * anything else: push the character back and switch to
      beforeDoctypePublicIdentifierState.

    Always returns True.
    """
    char = self.stream.char()

    if char in spaceCharacters:
        self.state = self.beforeDoctypePublicIdentifierState
        return True

    isQuote = char in ("'", '"')
    if not isQuote and char is EOF:
        self.tokenQueue.append(
            {"type": tokenTypes["ParseError"], "data": "eof-in-doctype"})
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True

    if isQuote:
        # A quote with no whitespace after the keyword is reported here;
        # the character itself is re-read by the next state.
        self.tokenQueue.append(
            {"type": tokenTypes["ParseError"],
             "data": "unexpected-char-in-doctype"})

    # Quotes and all remaining characters are handed to the next state.
    self.stream.unget(char)
    self.state = self.beforeDoctypePublicIdentifierState
    return True
9841004

9851005
def beforeDoctypePublicIdentifierState(self):
9861006
data = self.stream.char()
@@ -1054,17 +1074,47 @@ def doctypePublicIdentifierSingleQuotedState(self):
10541074
def afterDoctypePublicIdentifierState(self):
    """Tokenizer state entered after a DOCTYPE public identifier.

    Reads one character from the stream and dispatches on it:

    * whitespace: switch to betweenDoctypePublicAndSystemIdentifiersState;
    * ">": queue the current token and switch to dataState;
    * a double or single quote: queue an "unexpected-char-in-doctype"
      parse error, reset the token's systemId to the empty string and
      switch to the matching quoted system identifier state;
    * EOF: queue an "eof-in-doctype" parse error, mark the current token
      as not correct, queue it and switch to dataState;
    * anything else: queue an "unexpected-char-in-doctype" parse error,
      mark the current token as not correct and switch to
      bogusDoctypeState.

    Always returns True.
    """
    char = self.stream.char()

    if char in spaceCharacters:
        self.state = self.betweenDoctypePublicAndSystemIdentifiersState
        return True

    if char == ">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True

    if char == '"' or char == "'":
        # The system identifier starts with no whitespace after the
        # public identifier: report it, then begin collecting the id.
        self.tokenQueue.append(
            {"type": tokenTypes["ParseError"],
             "data": "unexpected-char-in-doctype"})
        self.currentToken["systemId"] = u""
        if char == '"':
            self.state = self.doctypeSystemIdentifierDoubleQuotedState
        else:
            self.state = self.doctypeSystemIdentifierSingleQuotedState
        return True

    if char is EOF:
        self.tokenQueue.append(
            {"type": tokenTypes["ParseError"], "data": "eof-in-doctype"})
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True

    # Any other character makes the DOCTYPE bogus.
    self.tokenQueue.append(
        {"type": tokenTypes["ParseError"],
         "data": "unexpected-char-in-doctype"})
    self.currentToken["correct"] = False
    self.state = self.bogusDoctypeState
    return True
1103+
1104+
def betweenDoctypePublicAndSystemIdentifiersState(self):
1105+
data = self.stream.char()
1106+
if data in spaceCharacters:
1107+
pass
10641108
elif data == ">":
10651109
self.tokenQueue.append(self.currentToken)
10661110
self.state = self.dataState
1067-
elif data is EOF:
1111+
elif data == '"':
1112+
self.currentToken["systemId"] = u""
1113+
self.state = self.doctypeSystemIdentifierDoubleQuotedState
1114+
elif data == "'":
1115+
self.currentToken["systemId"] = u""
1116+
self.state = self.doctypeSystemIdentifierSingleQuotedState
1117+
elif data == EOF:
10681118
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
10691119
"eof-in-doctype"})
10701120
self.currentToken["correct"] = False
@@ -1077,6 +1127,26 @@ def afterDoctypePublicIdentifierState(self):
10771127
self.state = self.bogusDoctypeState
10781128
return True
10791129

1130+
def afterDoctypeSystemKeywordState(self):
    """Tokenizer state entered right after a DOCTYPE's SYSTEM keyword.

    Reads one character from the stream and dispatches on it:

    * whitespace: switch to beforeDoctypeSystemIdentifierState;
    * a single or double quote: queue an "unexpected-char-in-doctype"
      parse error, push the quote back and switch to
      beforeDoctypeSystemIdentifierState;
    * EOF: queue an "eof-in-doctype" parse error, mark the current token
      as not correct, queue it and switch to dataState;
    * anything else: push the character back and switch to
      beforeDoctypeSystemIdentifierState.

    Always returns True.
    """
    char = self.stream.char()

    if char in spaceCharacters:
        self.state = self.beforeDoctypeSystemIdentifierState
        return True

    isQuote = char in ("'", '"')
    if not isQuote and char is EOF:
        self.tokenQueue.append(
            {"type": tokenTypes["ParseError"], "data": "eof-in-doctype"})
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True

    if isQuote:
        # A quote with no whitespace after the keyword is reported here;
        # the character itself is re-read by the next state.
        self.tokenQueue.append(
            {"type": tokenTypes["ParseError"],
             "data": "unexpected-char-in-doctype"})

    # Quotes and all remaining characters are handed to the next state.
    self.stream.unget(char)
    self.state = self.beforeDoctypeSystemIdentifierState
    return True
1149+
10801150
def beforeDoctypeSystemIdentifierState(self):
10811151
data = self.stream.char()
10821152
if data in spaceCharacters:

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy