Skip to content

Commit 768ba79

Browse files
committed
More stuff working including treewalkers, parts of parse.py dom, (c)ElementTree
--HG-- branch : svgmathml extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/branches/svgmathml%401266
1 parent 10b9010 commit 768ba79

31 files changed

+303
-264
lines changed

parse.py

Lines changed: 22 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#!/usr/bin/env python
1+
#!/usr/bin/env python3.0
22
"""usage: %prog [options] filename
33
44
Parse a document to a simpletree tree, with optional profiling
@@ -9,11 +9,16 @@
99
import os
1010
from optparse import OptionParser
1111

12+
print(sys.stdout.encoding)
13+
1214
#RELEASE remove
1315
sys.path.insert(0,os.path.abspath(os.path.join(__file__,'../src')))
1416
#END RELEASE
15-
from html5lib import html5parser, liberalxmlparser, sanitizer
17+
print(sys.path)
18+
import html5lib
19+
import html5lib.html5parser as html5parser
1620
from html5lib.tokenizer import HTMLTokenizer
21+
from html5lib import treebuilders
1722
from html5lib import treebuilders, serializer, treewalkers
1823
from html5lib import constants
1924

@@ -27,8 +32,8 @@ def parse():
2732
# Try opening from the internet
2833
if f.startswith('http://'):
2934
try:
30-
import urllib, cgi
31-
f = urllib.urlopen(f)
35+
from urllib import request
36+
f = request.urlopen(f)
3237
contentType = f.headers.get('content-type')
3338
if contentType:
3439
(mediaType, params) = cgi.parse_header(contentType)
@@ -39,7 +44,7 @@ def parse():
3944
else:
4045
try:
4146
# Try opening from file system
42-
f = open(f)
47+
f = open(f, "rb")
4348
except IOError: pass
4449
except IndexError:
4550
sys.stderr.write("No filename provided. Use -h for help\n")
@@ -64,16 +69,16 @@ def parse():
6469

6570
if opts.profile:
6671
#XXX should import cProfile instead and use that
67-
import hotshot
68-
import hotshot.stats
69-
prof = hotshot.Profile('stats.prof')
70-
prof.runcall(parseMethod, f, encoding=encoding)
72+
try:
73+
import cProfile as profile
74+
except ImportError:
75+
import profile
76+
import pstats
77+
prof = profile.run('parseMethod(f, encoding=encoding)', 'prof.out')
7178
prof.close()
7279
# XXX - We should use a temp file here
73-
stats = hotshot.stats.load('stats.prof')
74-
stats.strip_dirs()
75-
stats.sort_stats('time')
76-
stats.print_stats()
80+
stats = pstats.stats('prof.out')
81+
stats.strip_dirs().sort_stats('time').print_stats()
7782
elif opts.time:
7883
import time
7984
t0 = time.time()
@@ -88,13 +93,14 @@ def parse():
8893

8994
def printOutput(parser, document, opts):
9095
if opts.encoding:
91-
print "Encoding:", parser.tokenizer.stream.charEncoding
96+
print("Encoding:", parser.tokenizer.stream.charEncoding)
9297
if opts.xml:
9398
sys.stdout.write(document.toxml("utf-8"))
9499
elif opts.tree:
95100
if not hasattr(document,'__getitem__'): document = [document]
96101
for fragment in document:
97-
print parser.tree.testSerializer(fragment).encode("utf-8")
102+
sys.stdout.write(parser.tree.testSerializer(fragment))
103+
sys.stdout.write("\n")
98104
elif opts.hilite:
99105
sys.stdout.write(document.hilite("utf-8"))
100106
elif opts.html:
@@ -103,7 +109,7 @@ def printOutput(parser, document, opts):
103109
kwargs[opt] = getattr(opts,opt)
104110
if not kwargs['quote_char']: del kwargs['quote_char']
105111
tokens = treewalkers.getTreeWalker(opts.treebuilder)(document)
106-
for text in serializer.HTMLSerializer(**kwargs).serialize(tokens, encoding='utf-8'):
112+
for text in serializer.HTMLSerializer(**kwargs).serialize(tokens):
107113
sys.stdout.write(text)
108114
if not text.endswith('\n'): sys.stdout.write('\n')
109115
if opts.error:

src/html5lib/__init__.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,9 @@
1010
f = open("my_document.html")
1111
tree = html5lib.parse(f)
1212
"""
13-
print(__path__)
1413

15-
#from .html5parser import HTMLParser, parse
16-
#from treebuilders import getTreeBuilder
14+
from .html5parser import HTMLParser, parse
15+
from .treebuilders import getTreeBuilder
1716

1817
#from .liberalxmlparser import XMLParser, XHTMLParser
1918

src/html5lib/filters/formfiller.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,10 @@
44
# See http://www.whatwg.org/specs/web-forms/current-work/#seeding
55
#
66

7-
import _base
7+
from . import _base
88

99
from html5lib.constants import spaceCharacters
10-
spaceCharacters = u"".join(spaceCharacters)
10+
spaceCharacters = "".join(spaceCharacters)
1111

1212
class SimpleFilter(_base.Filter):
1313
def __init__(self, source, fieldStorage):
@@ -29,13 +29,13 @@ def __iter__(self):
2929
input_checked_index = -1
3030
for i,(n,v) in enumerate(token["data"]):
3131
n = n.lower()
32-
if n == u"name":
32+
if n == "name":
3333
field_name = v.strip(spaceCharacters)
34-
elif n == u"type":
34+
elif n == "type":
3535
field_type = v.strip(spaceCharacters)
36-
elif n == u"checked":
36+
elif n == "checked":
3737
input_checked_index = i
38-
elif n == u"value":
38+
elif n == "value":
3939
input_value_index = i
4040

4141
value_list = self.fieldStorage.getlist(field_name)
@@ -45,20 +45,20 @@ def __iter__(self):
4545
else:
4646
value = ""
4747

48-
if field_type in (u"checkbox", u"radio"):
48+
if field_type in ("checkbox", "radio"):
4949
if value_list:
5050
if token["data"][input_value_index][1] == value:
5151
if input_checked_index < 0:
52-
token["data"].append((u"checked", u""))
52+
token["data"].append(("checked", ""))
5353
field_indices[field_name] = field_index + 1
5454
elif input_checked_index >= 0:
5555
del token["data"][input_checked_index]
5656

57-
elif field_type not in (u"button", u"submit", u"reset"):
57+
elif field_type not in ("button", "submit", "reset"):
5858
if input_value_index >= 0:
59-
token["data"][input_value_index] = (u"value", value)
59+
token["data"][input_value_index] = ("value", value)
6060
else:
61-
token["data"].append((u"value", value))
61+
token["data"].append(("value", value))
6262
field_indices[field_name] = field_index + 1
6363

6464
field_type = None
@@ -96,7 +96,7 @@ def __iter__(self):
9696
value = ""
9797
if (is_select_multiple or not is_selected_option_found) and option_value == value:
9898
if option_selected_index < 0:
99-
token["data"].append((u"selected", u""))
99+
token["data"].append(("selected", ""))
100100
field_indices[field_name] = field_index + 1
101101
is_selected_option_found = True
102102
elif option_selected_index >= 0:

src/html5lib/filters/inject_meta_charset.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import _base
1+
from . import _base
22

33
class Filter(_base.Filter):
44
def __init__(self, source, encoding):
@@ -23,7 +23,7 @@ def __iter__(self):
2323
content_index = -1
2424
for i,(name,value) in enumerate(token["data"]):
2525
if name.lower() == 'charset':
26-
token["data"][i] = (u'charset', self.encoding)
26+
token["data"][i] = ('charset', self.encoding)
2727
meta_found = True
2828
break
2929
elif name == 'http-equiv' and value.lower() == 'content-type':
@@ -32,7 +32,7 @@ def __iter__(self):
3232
content_index = i
3333
else:
3434
if has_http_equiv_content_type and content_index >= 0:
35-
token["data"][content_index] = (u'content', u'text/html; charset=%s' % self.encoding)
35+
token["data"][content_index] = ('content', 'text/html; charset=%s' % self.encoding)
3636
meta_found = True
3737

3838
elif token["name"].lower() == "head" and not meta_found:

src/html5lib/filters/iso639codes.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -746,4 +746,4 @@ def isValidLangCode(value):
746746
lang, sublang = value.split('-', 1)
747747
else:
748748
lang = value
749-
return isoLang.has_key(unicode.lower(unicode(lang)))
749+
return str.lower(str(lang)) in isoLang

src/html5lib/filters/lint.py

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
from gettext import gettext
22
_ = gettext
33

4-
import _base
4+
from . import _base
55
from html5lib.constants import cdataElements, rcdataElements, voidElements
66

77
from html5lib.constants import spaceCharacters
8-
spaceCharacters = u"".join(spaceCharacters)
8+
spaceCharacters = "".join(spaceCharacters)
99

1010
class LintError(Exception): pass
1111

@@ -19,22 +19,22 @@ def __iter__(self):
1919
name = token["name"]
2020
if contentModelFlag != "PCDATA":
2121
raise LintError(_("StartTag not in PCDATA content model flag: %s") % name)
22-
if not isinstance(name, unicode):
23-
raise LintError(_(u"Tag name is not a string: %r") % name)
22+
if not isinstance(name, str):
23+
raise LintError(_("Tag name is not a string: %r") % name)
2424
if not name:
25-
raise LintError(_(u"Empty tag name"))
25+
raise LintError(_("Empty tag name"))
2626
if type == "StartTag" and name in voidElements:
27-
raise LintError(_(u"Void element reported as StartTag token: %s") % name)
27+
raise LintError(_("Void element reported as StartTag token: %s") % name)
2828
elif type == "EmptyTag" and name not in voidElements:
29-
raise LintError(_(u"Non-void element reported as EmptyTag token: %s") % token["name"])
29+
raise LintError(_("Non-void element reported as EmptyTag token: %s") % token["name"])
3030
if type == "StartTag":
3131
open_elements.append(name)
3232
for name, value in token["data"]:
33-
if not isinstance(name, unicode):
33+
if not isinstance(name, str):
3434
raise LintError(_("Attribute name is not a string: %r") % name)
3535
if not name:
36-
raise LintError(_(u"Empty attribute name"))
37-
if not isinstance(value, unicode):
36+
raise LintError(_("Empty attribute name"))
37+
if not isinstance(value, str):
3838
raise LintError(_("Attribute value is not a string: %r") % value)
3939
if name in cdataElements:
4040
contentModelFlag = "CDATA"
@@ -45,15 +45,15 @@ def __iter__(self):
4545

4646
elif type == "EndTag":
4747
name = token["name"]
48-
if not isinstance(name, unicode):
49-
raise LintError(_(u"Tag name is not a string: %r") % name)
48+
if not isinstance(name, str):
49+
raise LintError(_("Tag name is not a string: %r") % name)
5050
if not name:
51-
raise LintError(_(u"Empty tag name"))
51+
raise LintError(_("Empty tag name"))
5252
if name in voidElements:
53-
raise LintError(_(u"Void element reported as EndTag token: %s") % name)
53+
raise LintError(_("Void element reported as EndTag token: %s") % name)
5454
start_name = open_elements.pop()
5555
if start_name != name:
56-
raise LintError(_(u"EndTag (%s) does not match StartTag (%s)") % (name, start_name))
56+
raise LintError(_("EndTag (%s) does not match StartTag (%s)") % (name, start_name))
5757
contentModelFlag = "PCDATA"
5858

5959
elif type == "Comment":
@@ -62,27 +62,27 @@ def __iter__(self):
6262

6363
elif type in ("Characters", "SpaceCharacters"):
6464
data = token["data"]
65-
if not isinstance(data, unicode):
65+
if not isinstance(data, str):
6666
raise LintError(_("Attribute name is not a string: %r") % data)
6767
if not data:
68-
raise LintError(_(u"%s token with empty data") % type)
68+
raise LintError(_("%s token with empty data") % type)
6969
if type == "SpaceCharacters":
7070
data = data.strip(spaceCharacters)
7171
if data:
72-
raise LintError(_(u"Non-space character(s) found in SpaceCharacters token: ") % data)
72+
raise LintError(_("Non-space character(s) found in SpaceCharacters token: ") % data)
7373

7474
elif type == "Doctype":
7575
name = token["name"]
7676
if contentModelFlag != "PCDATA":
7777
raise LintError(_("Doctype not in PCDATA content model flag: %s") % name)
78-
if not isinstance(name, unicode):
79-
raise LintError(_(u"Tag name is not a string: %r") % name)
78+
if not isinstance(name, str):
79+
raise LintError(_("Tag name is not a string: %r") % name)
8080
# XXX: what to do with token["data"] ?
8181

8282
elif type in ("ParseError", "SerializeError"):
8383
pass
8484

8585
else:
86-
raise LintError(_(u"Unknown token type: %s") % type)
86+
raise LintError(_("Unknown token type: %s") % type)
8787

8888
yield token

src/html5lib/filters/optionaltags.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import _base
1+
from . import _base
22

33
class Filter(_base.Filter):
44
def slider(self):

src/html5lib/filters/sanitizer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import _base
1+
from . import _base
22
from html5lib.sanitizer import HTMLSanitizerMixin
33

44
class Filter(_base.Filter, HTMLSanitizerMixin):

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy