Skip to content

Commit ce43212

Browse files
committed
Rejiggered the tokeniser so it only ever unconsumes a single character. Simplified the line/column position counters. (Saves about 5% parsing time.)
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401241
1 parent b7c7de7 commit ce43212

File tree

3 files changed: +268 additions, -192 deletions (lines changed)

src/html5lib/inputstream.py

Lines changed: 85 additions & 70 deletions
Original file line number | Diff line number | Diff line change
@@ -72,11 +72,17 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
7272
self.chunk = u""
7373
self.chunkSize = 0
7474
self.chunkOffset = 0
75-
self.ungetBuffer = [] # reversed list of chars from unget()
76-
self.readChars = []
7775
self.errors = []
78-
79-
self.lineLengths = []
76+
# Single-character buffer to handle 'unget'
77+
self.ungetChar = u"" # use u"" to mean 'no character' (because None means EOF)
78+
79+
# Remember the current position in the document
80+
self.positionLine = 1
81+
self.positionCol = 0
82+
# Remember the length of the last line, so unget("\n") can restore
83+
# positionCol. (Only one character can be ungot at once, so we only
84+
# need to remember the single last line.)
85+
self.lastLineLength = None
8086

8187
#Flag to indicate we may have a CR LF broken across a data chunk
8288
self._lastChunkEndsWithCR = False
@@ -219,51 +225,59 @@ def detectEncodingMeta(self):
219225
encoding = parser.getEncoding()
220226
return encoding
221227

222-
def updatePosition(self):
223-
#Remove EOF from readChars, if present
224-
if not self.readChars:
225-
return
226-
if self.readChars and self.readChars[-1] == EOF:
227-
#There may be more than one EOF in readChars so we cannot assume
228-
#readChars.index(EOF) == -1
229-
self.readChars = self.readChars[:self.readChars.index(EOF)]
230-
readChars = "".join(self.readChars)
231-
lines = readChars.split("\n")
232-
if self.lineLengths:
233-
self.lineLengths[-1] += len(lines[0])
228+
def updatePosition(self, chars):
229+
# Update the position attributes to correspond to some sequence of
230+
# read characters
231+
232+
# Find the last newline character
233+
idx = chars.rfind(u"\n")
234+
if idx == -1:
235+
# No newlines in chars
236+
self.positionCol += len(chars)
234237
else:
235-
self.lineLengths.append(len(lines[0]))
236-
for line in lines[1:]:
237-
self.lineLengths.append(len(line))
238-
self.readChars = []
239-
#print self.lineLengths
238+
# Find the last-but-one newline character
239+
idx2 = chars.rfind(u"\n", 0, idx)
240+
if idx2 == -1:
241+
# Only one newline in chars
242+
self.positionLine += 1
243+
self.lastLineLength = self.positionCol + idx
244+
self.positionCol = len(chars) - (idx + 1)
245+
else:
246+
# At least two newlines in chars
247+
newlines = chars.count(u"\n")
248+
self.positionLine += newlines
249+
self.lastLineLength = idx - (idx2 + 1)
250+
self.positionCol = len(chars) - (idx + 1)
240251

241252
def position(self):
242253
"""Returns (line, col) of the current position in the stream."""
243-
self.updatePosition()
244-
if self.lineLengths:
245-
line, col = len(self.lineLengths), self.lineLengths[-1]
246-
else:
247-
line, col = 1,0
248-
return (line, col)
254+
return (self.positionLine, self.positionCol)
249255

250256
def char(self):
251257
""" Read one character from the stream or queue if available. Return
252258
EOF when EOF is reached.
253259
"""
254-
if self.ungetBuffer:
255-
char = self.ungetBuffer.pop()
256-
self.readChars.append(char)
257-
return char
258-
259-
if self.chunkOffset >= self.chunkSize:
260-
if not self.readChunk():
261-
return EOF
262-
263-
char = self.chunk[self.chunkOffset]
264-
self.chunkOffset += 1
260+
char = self.ungetChar
261+
if char != u"":
262+
# Use the ungot character, and reset the buffer
263+
self.ungetChar = u""
264+
else:
265+
# Read a new chunk from the input stream if necessary
266+
if self.chunkOffset >= self.chunkSize:
267+
if not self.readChunk():
268+
return EOF
269+
270+
char = self.chunk[self.chunkOffset]
271+
self.chunkOffset += 1
272+
273+
# Update the position attributes
274+
if char == u"\n":
275+
self.lastLineLength = self.positionCol
276+
self.positionCol = 0
277+
self.positionLine += 1
278+
elif char is not EOF:
279+
self.positionCol += 1
265280

266-
self.readChars.append(char)
267281
return char
268282

269283
def readChunk(self, chunkSize=_defaultChunkSize):
@@ -282,20 +296,18 @@ def readChunk(self, chunkSize=_defaultChunkSize):
282296

283297
data = data.replace(u"\u0000", u"\ufffd")
284298
#Check for CR LF broken across chunks
285-
if (self._lastChunkEndsWithCR and data[0] == "\n"):
299+
if (self._lastChunkEndsWithCR and data[0] == u"\n"):
286300
data = data[1:]
287301
# Stop if the chunk is now empty
288302
if not data:
289303
return False
290-
self._lastChunkEndsWithCR = data[-1] == "\r"
291-
data = data.replace("\r\n", "\n")
292-
data = data.replace("\r", "\n")
304+
self._lastChunkEndsWithCR = data[-1] == u"\r"
305+
data = data.replace(u"\r\n", u"\n")
306+
data = data.replace(u"\r", u"\n")
293307

294-
data = unicode(data)
295308
self.chunk = data
296309
self.chunkSize = len(data)
297310

298-
self.updatePosition()
299311
return True
300312

301313
def charsUntil(self, characters, opposite = False):
@@ -307,22 +319,22 @@ def charsUntil(self, characters, opposite = False):
307319

308320
rv = []
309321

310-
# The unget buffer is typically small and rarely used, so
311-
# just check each character individually
312-
while self.ungetBuffer:
313-
if self.ungetBuffer[-1] == EOF or (self.ungetBuffer[-1] in characters) != opposite:
314-
r = u"".join(rv)
315-
self.readChars.extend(list(r))
316-
return r
322+
# Check the ungot character, if any.
323+
# (Since it's only a single character, don't use the regex here)
324+
char = self.ungetChar
325+
if char != u"":
326+
if char is EOF or (char in characters) != opposite:
327+
return u""
317328
else:
318-
rv.append(self.ungetBuffer.pop())
329+
rv.append(char)
330+
self.ungetChar = u""
319331

320332
# Use a cache of regexps to find the required characters
321333
try:
322334
chars = charsUntilRegEx[(characters, opposite)]
323335
except KeyError:
324336
for c in characters: assert(ord(c) < 128)
325-
regex = u"".join(["\\x%02x" % ord(c) for c in characters])
337+
regex = u"".join([u"\\x%02x" % ord(c) for c in characters])
326338
if not opposite:
327339
regex = u"^%s" % regex
328340
chars = charsUntilRegEx[(characters, opposite)] = re.compile(u"[%s]*" % regex)
@@ -343,24 +355,27 @@ def charsUntil(self, characters, opposite = False):
343355
break
344356

345357
r = u"".join(rv)
346-
self.readChars.extend(list(r))
358+
self.updatePosition(r)
347359
return r
348360

349-
def unget(self, chars):
350-
self.updatePosition()
351-
if chars:
352-
l = list(chars)
353-
l.reverse()
354-
self.ungetBuffer.extend(l)
355-
#Alter the current line, col position
356-
for c in chars[::-1]:
357-
if c is None:
358-
continue
359-
elif c == '\n':
360-
assert self.lineLengths[-1] == 0
361-
self.lineLengths.pop()
362-
else:
363-
self.lineLengths[-1] -= 1
361+
def unget(self, char):
362+
# Only one character is allowed to be ungotten at once - it must
363+
# be consumed again before any further call to unget
364+
assert self.ungetChar == u""
365+
366+
self.ungetChar = char
367+
368+
# Update the position attributes
369+
if char is None:
370+
pass
371+
elif char == u"\n":
372+
assert self.positionLine >= 1
373+
assert self.lastLineLength is not None
374+
self.positionLine -= 1
375+
self.positionCol = self.lastLineLength
376+
self.lastLineLength = None
377+
else:
378+
self.positionCol -= 1
364379

365380
class EncodingBytes(str):
366381
"""String-like object with an associated position and various extra methods

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy