Skip to content

Commit 93ee3b3

Browse files
committed
Merge pull request #137 from gsnedders/webencodings
Fix #124: Move to webencodings for decoding the input byte stream.
2 parents 44b0bbc + 85723e2 commit 93ee3b3

File tree

11 files changed, with 49 additions and 306 deletions.

.pytest.expect

-3.55 KB
Binary file not shown.

CHANGES.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,10 @@ Released on XXX
 
 * Move testsuite to ``py.test``.
 
+* Fix #124: move to webencodings for decoding the input byte stream;
+  this makes html5lib compliant with the Encoding Standard, and
+  introduces a required dependency on webencodings.
+
 
 0.9999999/1.0b8
 ~~~~~~~~~~~~~~~

html5lib/constants.py

Lines changed: 0 additions & 229 deletions
Original file line numberDiff line numberDiff line change
@@ -2846,235 +2846,6 @@
28462846
0x9F: "\u0178",
28472847
}
28482848

2849-
encodings = {
2850-
'437': 'cp437',
2851-
'850': 'cp850',
2852-
'852': 'cp852',
2853-
'855': 'cp855',
2854-
'857': 'cp857',
2855-
'860': 'cp860',
2856-
'861': 'cp861',
2857-
'862': 'cp862',
2858-
'863': 'cp863',
2859-
'865': 'cp865',
2860-
'866': 'cp866',
2861-
'869': 'cp869',
2862-
'ansix341968': 'ascii',
2863-
'ansix341986': 'ascii',
2864-
'arabic': 'iso8859-6',
2865-
'ascii': 'ascii',
2866-
'asmo708': 'iso8859-6',
2867-
'big5': 'big5',
2868-
'big5hkscs': 'big5hkscs',
2869-
'chinese': 'gbk',
2870-
'cp037': 'cp037',
2871-
'cp1026': 'cp1026',
2872-
'cp154': 'ptcp154',
2873-
'cp367': 'ascii',
2874-
'cp424': 'cp424',
2875-
'cp437': 'cp437',
2876-
'cp500': 'cp500',
2877-
'cp775': 'cp775',
2878-
'cp819': 'windows-1252',
2879-
'cp850': 'cp850',
2880-
'cp852': 'cp852',
2881-
'cp855': 'cp855',
2882-
'cp857': 'cp857',
2883-
'cp860': 'cp860',
2884-
'cp861': 'cp861',
2885-
'cp862': 'cp862',
2886-
'cp863': 'cp863',
2887-
'cp864': 'cp864',
2888-
'cp865': 'cp865',
2889-
'cp866': 'cp866',
2890-
'cp869': 'cp869',
2891-
'cp936': 'gbk',
2892-
'cpgr': 'cp869',
2893-
'cpis': 'cp861',
2894-
'csascii': 'ascii',
2895-
'csbig5': 'big5',
2896-
'cseuckr': 'cp949',
2897-
'cseucpkdfmtjapanese': 'euc_jp',
2898-
'csgb2312': 'gbk',
2899-
'cshproman8': 'hp-roman8',
2900-
'csibm037': 'cp037',
2901-
'csibm1026': 'cp1026',
2902-
'csibm424': 'cp424',
2903-
'csibm500': 'cp500',
2904-
'csibm855': 'cp855',
2905-
'csibm857': 'cp857',
2906-
'csibm860': 'cp860',
2907-
'csibm861': 'cp861',
2908-
'csibm863': 'cp863',
2909-
'csibm864': 'cp864',
2910-
'csibm865': 'cp865',
2911-
'csibm866': 'cp866',
2912-
'csibm869': 'cp869',
2913-
'csiso2022jp': 'iso2022_jp',
2914-
'csiso2022jp2': 'iso2022_jp_2',
2915-
'csiso2022kr': 'iso2022_kr',
2916-
'csiso58gb231280': 'gbk',
2917-
'csisolatin1': 'windows-1252',
2918-
'csisolatin2': 'iso8859-2',
2919-
'csisolatin3': 'iso8859-3',
2920-
'csisolatin4': 'iso8859-4',
2921-
'csisolatin5': 'windows-1254',
2922-
'csisolatin6': 'iso8859-10',
2923-
'csisolatinarabic': 'iso8859-6',
2924-
'csisolatincyrillic': 'iso8859-5',
2925-
'csisolatingreek': 'iso8859-7',
2926-
'csisolatinhebrew': 'iso8859-8',
2927-
'cskoi8r': 'koi8-r',
2928-
'csksc56011987': 'cp949',
2929-
'cspc775baltic': 'cp775',
2930-
'cspc850multilingual': 'cp850',
2931-
'cspc862latinhebrew': 'cp862',
2932-
'cspc8codepage437': 'cp437',
2933-
'cspcp852': 'cp852',
2934-
'csptcp154': 'ptcp154',
2935-
'csshiftjis': 'shift_jis',
2936-
'csunicode11utf7': 'utf-7',
2937-
'cyrillic': 'iso8859-5',
2938-
'cyrillicasian': 'ptcp154',
2939-
'ebcdiccpbe': 'cp500',
2940-
'ebcdiccpca': 'cp037',
2941-
'ebcdiccpch': 'cp500',
2942-
'ebcdiccphe': 'cp424',
2943-
'ebcdiccpnl': 'cp037',
2944-
'ebcdiccpus': 'cp037',
2945-
'ebcdiccpwt': 'cp037',
2946-
'ecma114': 'iso8859-6',
2947-
'ecma118': 'iso8859-7',
2948-
'elot928': 'iso8859-7',
2949-
'eucjp': 'euc_jp',
2950-
'euckr': 'cp949',
2951-
'extendedunixcodepackedformatforjapanese': 'euc_jp',
2952-
'gb18030': 'gb18030',
2953-
'gb2312': 'gbk',
2954-
'gb231280': 'gbk',
2955-
'gbk': 'gbk',
2956-
'greek': 'iso8859-7',
2957-
'greek8': 'iso8859-7',
2958-
'hebrew': 'iso8859-8',
2959-
'hproman8': 'hp-roman8',
2960-
'hzgb2312': 'hz',
2961-
'ibm037': 'cp037',
2962-
'ibm1026': 'cp1026',
2963-
'ibm367': 'ascii',
2964-
'ibm424': 'cp424',
2965-
'ibm437': 'cp437',
2966-
'ibm500': 'cp500',
2967-
'ibm775': 'cp775',
2968-
'ibm819': 'windows-1252',
2969-
'ibm850': 'cp850',
2970-
'ibm852': 'cp852',
2971-
'ibm855': 'cp855',
2972-
'ibm857': 'cp857',
2973-
'ibm860': 'cp860',
2974-
'ibm861': 'cp861',
2975-
'ibm862': 'cp862',
2976-
'ibm863': 'cp863',
2977-
'ibm864': 'cp864',
2978-
'ibm865': 'cp865',
2979-
'ibm866': 'cp866',
2980-
'ibm869': 'cp869',
2981-
'iso2022jp': 'iso2022_jp',
2982-
'iso2022jp2': 'iso2022_jp_2',
2983-
'iso2022kr': 'iso2022_kr',
2984-
'iso646irv1991': 'ascii',
2985-
'iso646us': 'ascii',
2986-
'iso88591': 'windows-1252',
2987-
'iso885910': 'iso8859-10',
2988-
'iso8859101992': 'iso8859-10',
2989-
'iso885911987': 'windows-1252',
2990-
'iso885913': 'iso8859-13',
2991-
'iso885914': 'iso8859-14',
2992-
'iso8859141998': 'iso8859-14',
2993-
'iso885915': 'iso8859-15',
2994-
'iso885916': 'iso8859-16',
2995-
'iso8859162001': 'iso8859-16',
2996-
'iso88592': 'iso8859-2',
2997-
'iso885921987': 'iso8859-2',
2998-
'iso88593': 'iso8859-3',
2999-
'iso885931988': 'iso8859-3',
3000-
'iso88594': 'iso8859-4',
3001-
'iso885941988': 'iso8859-4',
3002-
'iso88595': 'iso8859-5',
3003-
'iso885951988': 'iso8859-5',
3004-
'iso88596': 'iso8859-6',
3005-
'iso885961987': 'iso8859-6',
3006-
'iso88597': 'iso8859-7',
3007-
'iso885971987': 'iso8859-7',
3008-
'iso88598': 'iso8859-8',
3009-
'iso885981988': 'iso8859-8',
3010-
'iso88599': 'windows-1254',
3011-
'iso885991989': 'windows-1254',
3012-
'isoceltic': 'iso8859-14',
3013-
'isoir100': 'windows-1252',
3014-
'isoir101': 'iso8859-2',
3015-
'isoir109': 'iso8859-3',
3016-
'isoir110': 'iso8859-4',
3017-
'isoir126': 'iso8859-7',
3018-
'isoir127': 'iso8859-6',
3019-
'isoir138': 'iso8859-8',
3020-
'isoir144': 'iso8859-5',
3021-
'isoir148': 'windows-1254',
3022-
'isoir149': 'cp949',
3023-
'isoir157': 'iso8859-10',
3024-
'isoir199': 'iso8859-14',
3025-
'isoir226': 'iso8859-16',
3026-
'isoir58': 'gbk',
3027-
'isoir6': 'ascii',
3028-
'koi8r': 'koi8-r',
3029-
'koi8u': 'koi8-u',
3030-
'korean': 'cp949',
3031-
'ksc5601': 'cp949',
3032-
'ksc56011987': 'cp949',
3033-
'ksc56011989': 'cp949',
3034-
'l1': 'windows-1252',
3035-
'l10': 'iso8859-16',
3036-
'l2': 'iso8859-2',
3037-
'l3': 'iso8859-3',
3038-
'l4': 'iso8859-4',
3039-
'l5': 'windows-1254',
3040-
'l6': 'iso8859-10',
3041-
'l8': 'iso8859-14',
3042-
'latin1': 'windows-1252',
3043-
'latin10': 'iso8859-16',
3044-
'latin2': 'iso8859-2',
3045-
'latin3': 'iso8859-3',
3046-
'latin4': 'iso8859-4',
3047-
'latin5': 'windows-1254',
3048-
'latin6': 'iso8859-10',
3049-
'latin8': 'iso8859-14',
3050-
'latin9': 'iso8859-15',
3051-
'ms936': 'gbk',
3052-
'mskanji': 'shift_jis',
3053-
'pt154': 'ptcp154',
3054-
'ptcp154': 'ptcp154',
3055-
'r8': 'hp-roman8',
3056-
'roman8': 'hp-roman8',
3057-
'shiftjis': 'shift_jis',
3058-
'tis620': 'cp874',
3059-
'unicode11utf7': 'utf-7',
3060-
'us': 'ascii',
3061-
'usascii': 'ascii',
3062-
'utf16': 'utf-16',
3063-
'utf16be': 'utf-16-be',
3064-
'utf16le': 'utf-16-le',
3065-
'utf8': 'utf-8',
3066-
'windows1250': 'cp1250',
3067-
'windows1251': 'cp1251',
3068-
'windows1252': 'cp1252',
3069-
'windows1253': 'cp1253',
3070-
'windows1254': 'cp1254',
3071-
'windows1255': 'cp1255',
3072-
'windows1256': 'cp1256',
3073-
'windows1257': 'cp1257',
3074-
'windows1258': 'cp1258',
3075-
'windows936': 'gbk',
3076-
'x-x-big5': 'big5'}
3077-
30782849
tokenTypes = {
30792850
"Doctype": 0,
30802851
"Characters": 1,

html5lib/html5parser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@ def documentEncoding(self):
         """
         if not hasattr(self, 'tokenizer'):
             return None
-        return self.tokenizer.stream.charEncoding[0]
+        return self.tokenizer.stream.charEncoding[0].name
 
     def isHTMLIntegrationPoint(self, element):
         if (element.name == "annotation-xml" and

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy