Skip to content

Commit f47bc4f

Browse files
committed
Add start of SVG+MathML branch
--HG-- branch : svgmathml extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/branches/svgmathml%401261
1 parent bf5f514 commit f47bc4f

File tree

12 files changed

+911
-775
lines changed

12 files changed

+911
-775
lines changed

parse.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@
1212
#RELEASE remove
1313
sys.path.insert(0,os.path.abspath(os.path.join(__file__,'../src')))
1414
#END RELEASE
15-
from html5lib import html5parser, liberalxmlparser, sanitizer, tokenizer
15+
from html5lib import html5parser, liberalxmlparser, sanitizer
16+
from html5lib.tokenizer import HTMLTokenizer
1617
from html5lib import treebuilders, serializer, treewalkers
1718
from html5lib import constants
1819

@@ -80,7 +81,7 @@ def parse():
8081
t1 = time.time()
8182
printOutput(p, document, opts)
8283
t2 = time.time()
83-
print "\n\nRun took: %fs (plus %fs to print the output)"%(t1-t0, t2-t1)
84+
sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)"%(t1-t0, t2-t1))
8485
else:
8586
document = parseMethod(f, encoding=encoding)
8687
printOutput(p, document, opts)

src/html5lib/constants.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,10 @@
7272
_(u"Unexpected end of file in attribute value (')."),
7373
"eof-in-attribute-value-no-quotes":
7474
_(u"Unexpected end of file in attribute value."),
75+
"unexpected-EOF-after-solidus-in-tag":
76+
_(u"Unexpected end of file in tag. Expected >"),
77+
"unexpected-character-after-soldius-in-tag":
78+
_(u"Unexpected character after / in tag. Expected >"),
7579
"expected-dashes-or-doctype":
7680
_(u"Expected '--' or 'DOCTYPE'. Not found."),
7781
"incorrect-comment":
@@ -1098,5 +1102,18 @@
10981102
"ParseError":7
10991103
}
11001104

1105+
namespaces = {
1106+
"html":"http://www.w3.org/1999/xhtml",
1107+
"mathml":"http://www.w3.org/1998/Math/MathML",
1108+
"svg":"http://www.w3.org/2000/svg",
1109+
"xlink":"http://www.w3.org/1999/xlink",
1110+
"xml":"http://www.w3.org/XML/1998/namespace",
1111+
"xmlns":"http://www.w3.org/2000/xmlns/"
1112+
}
1113+
1114+
11011115
class DataLossWarning(UserWarning):
11021116
pass
1117+
1118+
class ReparseException(Exception):
1119+
pass

src/html5lib/html5parser.py

Lines changed: 612 additions & 598 deletions
Large diffs are not rendered by default.

src/html5lib/inputstream.py

Lines changed: 102 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import types
44

55
from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
6-
from constants import encodings
6+
from constants import encodings, ReparseException
77

88
#Non-unicode versions of constants for use in the pre-parser
99
spaceCharactersBytes = [str(item) for item in spaceCharacters]
@@ -16,6 +16,82 @@
1616

1717
# Cache for charsUntil()
1818
charsUntilRegEx = {}
19+
20+
class BufferedStream:
    """Buffering for streams that do not have buffering of their own.

    Everything read from the wrapped stream is remembered so that callers
    can ``seek`` back over it and ``tell`` where they are.  The buffer is
    kept as a list of chunks (one per read of the underlying stream)
    rather than one growing string, to avoid repeated string concatenation.
    """

    def __init__(self, stream):
        """Wrap *stream*, an object providing ``read(n)``."""
        self.stream = stream
        self.buffer = []
        # [chunk number, offset inside that chunk]; chunk -1 means that
        # nothing has been read from the underlying stream yet.
        self.position = [-1, 0]

    def tell(self):
        """Return the current position, counted from the start of the data."""
        consumed = sum(len(chunk) for chunk in self.buffer[:self.position[0]])
        return consumed + self.position[1]

    def seek(self, pos):
        """Move the read position to *pos*.

        Only data that has already been read can be revisited, so *pos*
        must lie between 0 and the number of buffered bytes (inclusive).
        """
        assert 0 <= pos <= self._bufferedBytes()
        if not self.buffer:
            # Nothing has been read yet, so the only valid target is 0.
            self.position = [-1, 0]
            return
        offset = pos
        i = 0
        # Walk the chunks, discounting each one that lies wholly before pos.
        while len(self.buffer[i]) < offset:
            offset -= len(self.buffer[i])
            i += 1
        self.position = [i, offset]

    def read(self, bytes):
        """Return up to *bytes* items, from the buffer first, then the stream."""
        if not self.buffer:
            return self._readStream(bytes)
        elif (self.position[0] == len(self.buffer) - 1 and
              self.position[1] == len(self.buffer[-1])):
            # Already at the very end of the buffered data.
            return self._readStream(bytes)
        else:
            return self._readFromBuffer(bytes)

    def _bufferedBytes(self):
        """Return the total amount of data held in the buffer."""
        return sum(len(item) for item in self.buffer)

    def _readStream(self, bytes):
        """Read from the underlying stream and remember the result as a new chunk."""
        data = self.stream.read(bytes)
        self.buffer.append(data)
        self.position[0] += 1
        self.position[1] = len(data)
        return data

    def _readFromBuffer(self, bytes):
        """Serve a read from buffered chunks, topping up from the stream if needed."""
        remainingBytes = bytes
        rv = []
        bufferIndex = self.position[0]
        bufferOffset = self.position[1]
        while bufferIndex < len(self.buffer) and remainingBytes != 0:
            assert remainingBytes > 0
            bufferedData = self.buffer[bufferIndex]

            if remainingBytes <= len(bufferedData) - bufferOffset:
                # The request ends inside this chunk.
                bytesToRead = remainingBytes
                self.position = [bufferIndex, bufferOffset + bytesToRead]
            else:
                # Take the rest of this chunk and carry on with the next.
                bytesToRead = len(bufferedData) - bufferOffset
                self.position = [bufferIndex, len(bufferedData)]
                bufferIndex += 1
            rv.append(bufferedData[bufferOffset:bufferOffset + bytesToRead])
            remainingBytes -= bytesToRead

            # Every chunk after the first is read from its beginning.
            bufferOffset = 0

        if remainingBytes:
            rv.append(self._readStream(remainingBytes))

        return "".join(rv)
93+
94+
1995

2096
class HTMLInputStream:
2197
"""Provides a unicode stream of characters to the HTMLTokenizer.
@@ -65,6 +141,9 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
65141
if (self.charEncoding[0] is None):
66142
self.charEncoding = self.detectEncoding(parseMeta, chardet)
67143

144+
self.reset()
145+
146+
def reset(self):
68147
self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
69148
'replace')
70149

@@ -100,6 +179,10 @@ def openStream(self, source):
100179
self.charEncoding = ("utf-8", "certain")
101180
import cStringIO
102181
stream = cStringIO.StringIO(str(source))
182+
183+
if not(hasattr(stream, "tell") and hasattr(stream, "seek")):
184+
stream = BufferedStream(stream)
185+
103186
return stream
104187

105188
def detectEncoding(self, parseMeta=True, chardet=True):
@@ -128,7 +211,7 @@ def detectEncoding(self, parseMeta=True, chardet=True):
128211
detector.feed(buffer)
129212
detector.close()
130213
encoding = detector.result['encoding']
131-
self.seek("".join(buffers), 0)
214+
self.rawStream.seek(0)
132215
except ImportError:
133216
pass
134217
# If all else fails use the default encoding
@@ -146,16 +229,18 @@ def detectEncoding(self, parseMeta=True, chardet=True):
146229

147230
def changeEncoding(self, newEncoding):
    """Switch the stream to *newEncoding*, forcing a reparse if it differs.

    *newEncoding* is first normalised with ``codecName``; UTF-16 variants
    are treated as UTF-8.  If the name is not recognised nothing happens.
    If it matches the encoding already in use, that encoding is simply
    marked as certain.  Otherwise the raw stream is rewound, the decoding
    reader is rebuilt for the new encoding and ``ReparseException`` is
    raised so that the caller can start parsing again from the beginning.
    """
    newEncoding = codecName(newEncoding)
    if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
        newEncoding = "utf-8"
    if newEncoding is None:
        return
    elif newEncoding == self.charEncoding[0]:
        self.charEncoding = (self.charEncoding[0], "certain")
    else:
        # Remember the old name before it is overwritten so the message
        # reports the real transition.
        oldEncoding = self.charEncoding[0]
        self.rawStream.seek(0)
        # reset() builds the reader from self.charEncoding, so the new
        # encoding has to be recorded before calling it.
        self.charEncoding = (newEncoding, "certain")
        self.reset()
        raise ReparseException("Encoding changed from %s to %s" %
                               (oldEncoding, newEncoding))
243+
159244
def detectBOM(self):
160245
"""Attempts to detect at BOM at the start of the stream. If
161246
an encoding can be determined from the BOM return the name of the
@@ -182,56 +267,21 @@ def detectBOM(self):
182267

183268
# Set the read position past the BOM if one was found, otherwise
184269
# set it to the start of the stream
185-
self.seek(string, encoding and seek or 0)
270+
self.rawStream.seek(encoding and seek or 0)
186271

187272
return encoding
188273

189-
def seek(self, buffer, n):
190-
"""Unget buffer[n:]"""
191-
if hasattr(self.rawStream, 'unget'):
192-
self.rawStream.unget(buffer[n:])
193-
return
194-
195-
if hasattr(self.rawStream, 'seek'):
196-
try:
197-
self.rawStream.seek(n)
198-
return
199-
except IOError:
200-
pass
201-
202-
class BufferedStream:
203-
def __init__(self, data, stream):
204-
self.data = data
205-
self.stream = stream
206-
def read(self, chars=-1):
207-
if chars == -1 or chars > len(self.data):
208-
result = self.data
209-
self.data = ''
210-
if chars == -1:
211-
return result + self.stream.read()
212-
else:
213-
return result + self.stream.read(chars-len(result))
214-
elif not self.data:
215-
return self.stream.read(chars)
216-
else:
217-
result = self.data[:chars]
218-
self.data = self.data[chars:]
219-
return result
220-
def unget(self, data):
221-
if self.data:
222-
self.data += data
223-
else:
224-
self.data = data
225-
226-
self.rawStream = BufferedStream(buffer[n:], self.rawStream)
227-
228274
def detectEncodingMeta(self):
    """Report the encoding declared by the meta element.

    Reads the first ``numBytesMeta`` items of the raw stream, hands them
    to ``EncodingParser`` and rewinds the raw stream to its start.  A
    declared UTF-16 variant is reported as "utf-8" instead.
    """
    head = self.rawStream.read(self.numBytesMeta)
    metaParser = EncodingParser(head)
    # Put the stream back so the real parse starts from the beginning.
    self.rawStream.seek(0)
    declared = metaParser.getEncoding()

    if declared not in ("utf-16", "utf-16-be", "utf-16-le"):
        return declared
    return "utf-8"
236286

237287
def updatePosition(self, chars):
@@ -485,13 +535,6 @@ def getEncoding(self):
485535
break
486536
if not keepParsing:
487537
break
488-
if self.encoding is not None:
489-
self.encoding = self.encoding.strip()
490-
#Spec violation that complies with hsivonen + mjs
491-
if (ascii_punctuation_re.sub("", self.encoding) in
492-
("utf16", "utf16be", "utf16le",
493-
"utf32", "utf32be", "utf32le")):
494-
self.encoding = "utf-8"
495538

496539
return self.encoding
497540

@@ -666,11 +709,12 @@ def parse(self):
666709
except StopIteration:
667710
return None
668711

712+
669713
def codecName(encoding):
    """Return the python codec name corresponding to an encoding or None if the
    string doesn't correspond to a valid encoding.

    The lookup key is *encoding* with everything matched by
    ``ascii_punctuation_re`` removed, lower-cased.  Anything that is not
    a string (including None) yields None.
    """
    # isinstance also accepts subclasses of the string types, and None
    # fails the check without needing a separate test.
    if isinstance(encoding, types.StringTypes):
        canonicalName = ascii_punctuation_re.sub("", encoding).lower()
        return encodings.get(canonicalName, None)
    return None

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy