Skip to content

Commit 0d0282b

Browse files
committed
Simplified unget code (and improved speed by ~2%)
--HG-- extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/trunk%401246
1 parent 22886b1 commit 0d0282b

File tree

1 file changed

+30
-39
lines changed

1 file changed

+30
-39
lines changed

src/html5lib/inputstream.py

Lines changed: 30 additions & 39 deletions
Original file line number | Diff line number | Diff line change
@@ -73,8 +73,6 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
7373
self.chunkSize = 0
7474
self.chunkOffset = 0
7575
self.errors = []
76-
# Single-character buffer to handle 'unget'
77-
self.ungetChar = u"" # use u"" to mean 'no character' (because None means EOF)
7876

7977
# Remember the current position in the document
8078
self.positionLine = 1
@@ -257,18 +255,13 @@ def char(self):
257255
""" Read one character from the stream or queue if available. Return
258256
EOF when EOF is reached.
259257
"""
260-
char = self.ungetChar
261-
if char != u"":
262-
# Use the ungot character, and reset the buffer
263-
self.ungetChar = u""
264-
else:
265-
# Read a new chunk from the input stream if necessary
266-
if self.chunkOffset >= self.chunkSize:
267-
if not self.readChunk():
268-
return EOF
258+
# Read a new chunk from the input stream if necessary
259+
if self.chunkOffset >= self.chunkSize:
260+
if not self.readChunk():
261+
return EOF
269262

270-
char = self.chunk[self.chunkOffset]
271-
self.chunkOffset += 1
263+
char = self.chunk[self.chunkOffset]
264+
self.chunkOffset += 1
272265

273266
# Update the position attributes
274267
if char == u"\n":
@@ -317,18 +310,6 @@ def charsUntil(self, characters, opposite = False):
317310
characters.
318311
"""
319312

320-
rv = []
321-
322-
# Check the ungot character, if any.
323-
# (Since it's only a single character, don't use the regex here)
324-
char = self.ungetChar
325-
if char != u"":
326-
if char is EOF or (char in characters) != opposite:
327-
return u""
328-
else:
329-
rv.append(char)
330-
self.ungetChar = u""
331-
332313
# Use a cache of regexps to find the required characters
333314
try:
334315
chars = charsUntilRegEx[(characters, opposite)]
@@ -339,6 +320,8 @@ def charsUntil(self, characters, opposite = False):
339320
regex = u"^%s" % regex
340321
chars = charsUntilRegEx[(characters, opposite)] = re.compile(u"[%s]+" % regex)
341322

323+
rv = []
324+
342325
while True:
343326
# Find the longest matching prefix
344327
m = chars.match(self.chunk, self.chunkOffset)
@@ -369,21 +352,29 @@ def charsUntil(self, characters, opposite = False):
369352
def unget(self, char):
370353
# Only one character is allowed to be ungotten at once - it must
371354
# be consumed again before any further call to unget
372-
assert self.ungetChar == u""
373-
374-
self.ungetChar = char
375355

376-
# Update the position attributes
377-
if char is None:
378-
pass
379-
elif char == u"\n":
380-
assert self.positionLine >= 1
381-
assert self.lastLineLength is not None
382-
self.positionLine -= 1
383-
self.positionCol = self.lastLineLength
384-
self.lastLineLength = None
385-
else:
386-
self.positionCol -= 1
356+
if char is not None:
357+
if self.chunkOffset == 0:
358+
# unget is called quite rarely, so it's a good idea to do
359+
# more work here if it saves a bit of work in the frequently
360+
# called char and charsUntil.
361+
# So, just prepend the ungotten character onto the current
362+
# chunk:
363+
self.chunk = char + self.chunk
364+
self.chunkSize += 1
365+
else:
366+
self.chunkOffset -= 1
367+
assert self.chunk[self.chunkOffset] == char
368+
369+
# Update the position attributes
370+
if char == u"\n":
371+
assert self.positionLine >= 1
372+
assert self.lastLineLength is not None
373+
self.positionLine -= 1
374+
self.positionCol = self.lastLineLength
375+
self.lastLineLength = None
376+
else:
377+
self.positionCol -= 1
387378

388379
class EncodingBytes(str):
389380
"""String-like object with an associated position and various extra methods

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy