Skip to content

Commit c2de86f

Browse files
committed
Merge remote-tracking branch 'origin/master' into fix_tokenizer_201411
2 parents dda96f8 + 93ee3b3 commit c2de86f

21 files changed

+138
-474
lines changed

.pytest.expect

-3.55 KB
Binary file not shown.

CHANGES.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,10 @@ Released on XXX
2222

2323
* Move testsuite to ``py.test``.
2424

25+
* Fix #124: move to webencodings for decoding the input byte stream;
26+
this makes html5lib compliant with the Encoding Standard, and
27+
introduces a required dependency on webencodings.
28+
2529

2630
0.9999999/1.0b8
2731
~~~~~~~~~~~~~~~

html5lib/constants.py

Lines changed: 0 additions & 229 deletions
Original file line numberDiff line numberDiff line change
@@ -2846,235 +2846,6 @@
28462846
0x9F: "\u0178",
28472847
}
28482848

2849-
encodings = {
2850-
'437': 'cp437',
2851-
'850': 'cp850',
2852-
'852': 'cp852',
2853-
'855': 'cp855',
2854-
'857': 'cp857',
2855-
'860': 'cp860',
2856-
'861': 'cp861',
2857-
'862': 'cp862',
2858-
'863': 'cp863',
2859-
'865': 'cp865',
2860-
'866': 'cp866',
2861-
'869': 'cp869',
2862-
'ansix341968': 'ascii',
2863-
'ansix341986': 'ascii',
2864-
'arabic': 'iso8859-6',
2865-
'ascii': 'ascii',
2866-
'asmo708': 'iso8859-6',
2867-
'big5': 'big5',
2868-
'big5hkscs': 'big5hkscs',
2869-
'chinese': 'gbk',
2870-
'cp037': 'cp037',
2871-
'cp1026': 'cp1026',
2872-
'cp154': 'ptcp154',
2873-
'cp367': 'ascii',
2874-
'cp424': 'cp424',
2875-
'cp437': 'cp437',
2876-
'cp500': 'cp500',
2877-
'cp775': 'cp775',
2878-
'cp819': 'windows-1252',
2879-
'cp850': 'cp850',
2880-
'cp852': 'cp852',
2881-
'cp855': 'cp855',
2882-
'cp857': 'cp857',
2883-
'cp860': 'cp860',
2884-
'cp861': 'cp861',
2885-
'cp862': 'cp862',
2886-
'cp863': 'cp863',
2887-
'cp864': 'cp864',
2888-
'cp865': 'cp865',
2889-
'cp866': 'cp866',
2890-
'cp869': 'cp869',
2891-
'cp936': 'gbk',
2892-
'cpgr': 'cp869',
2893-
'cpis': 'cp861',
2894-
'csascii': 'ascii',
2895-
'csbig5': 'big5',
2896-
'cseuckr': 'cp949',
2897-
'cseucpkdfmtjapanese': 'euc_jp',
2898-
'csgb2312': 'gbk',
2899-
'cshproman8': 'hp-roman8',
2900-
'csibm037': 'cp037',
2901-
'csibm1026': 'cp1026',
2902-
'csibm424': 'cp424',
2903-
'csibm500': 'cp500',
2904-
'csibm855': 'cp855',
2905-
'csibm857': 'cp857',
2906-
'csibm860': 'cp860',
2907-
'csibm861': 'cp861',
2908-
'csibm863': 'cp863',
2909-
'csibm864': 'cp864',
2910-
'csibm865': 'cp865',
2911-
'csibm866': 'cp866',
2912-
'csibm869': 'cp869',
2913-
'csiso2022jp': 'iso2022_jp',
2914-
'csiso2022jp2': 'iso2022_jp_2',
2915-
'csiso2022kr': 'iso2022_kr',
2916-
'csiso58gb231280': 'gbk',
2917-
'csisolatin1': 'windows-1252',
2918-
'csisolatin2': 'iso8859-2',
2919-
'csisolatin3': 'iso8859-3',
2920-
'csisolatin4': 'iso8859-4',
2921-
'csisolatin5': 'windows-1254',
2922-
'csisolatin6': 'iso8859-10',
2923-
'csisolatinarabic': 'iso8859-6',
2924-
'csisolatincyrillic': 'iso8859-5',
2925-
'csisolatingreek': 'iso8859-7',
2926-
'csisolatinhebrew': 'iso8859-8',
2927-
'cskoi8r': 'koi8-r',
2928-
'csksc56011987': 'cp949',
2929-
'cspc775baltic': 'cp775',
2930-
'cspc850multilingual': 'cp850',
2931-
'cspc862latinhebrew': 'cp862',
2932-
'cspc8codepage437': 'cp437',
2933-
'cspcp852': 'cp852',
2934-
'csptcp154': 'ptcp154',
2935-
'csshiftjis': 'shift_jis',
2936-
'csunicode11utf7': 'utf-7',
2937-
'cyrillic': 'iso8859-5',
2938-
'cyrillicasian': 'ptcp154',
2939-
'ebcdiccpbe': 'cp500',
2940-
'ebcdiccpca': 'cp037',
2941-
'ebcdiccpch': 'cp500',
2942-
'ebcdiccphe': 'cp424',
2943-
'ebcdiccpnl': 'cp037',
2944-
'ebcdiccpus': 'cp037',
2945-
'ebcdiccpwt': 'cp037',
2946-
'ecma114': 'iso8859-6',
2947-
'ecma118': 'iso8859-7',
2948-
'elot928': 'iso8859-7',
2949-
'eucjp': 'euc_jp',
2950-
'euckr': 'cp949',
2951-
'extendedunixcodepackedformatforjapanese': 'euc_jp',
2952-
'gb18030': 'gb18030',
2953-
'gb2312': 'gbk',
2954-
'gb231280': 'gbk',
2955-
'gbk': 'gbk',
2956-
'greek': 'iso8859-7',
2957-
'greek8': 'iso8859-7',
2958-
'hebrew': 'iso8859-8',
2959-
'hproman8': 'hp-roman8',
2960-
'hzgb2312': 'hz',
2961-
'ibm037': 'cp037',
2962-
'ibm1026': 'cp1026',
2963-
'ibm367': 'ascii',
2964-
'ibm424': 'cp424',
2965-
'ibm437': 'cp437',
2966-
'ibm500': 'cp500',
2967-
'ibm775': 'cp775',
2968-
'ibm819': 'windows-1252',
2969-
'ibm850': 'cp850',
2970-
'ibm852': 'cp852',
2971-
'ibm855': 'cp855',
2972-
'ibm857': 'cp857',
2973-
'ibm860': 'cp860',
2974-
'ibm861': 'cp861',
2975-
'ibm862': 'cp862',
2976-
'ibm863': 'cp863',
2977-
'ibm864': 'cp864',
2978-
'ibm865': 'cp865',
2979-
'ibm866': 'cp866',
2980-
'ibm869': 'cp869',
2981-
'iso2022jp': 'iso2022_jp',
2982-
'iso2022jp2': 'iso2022_jp_2',
2983-
'iso2022kr': 'iso2022_kr',
2984-
'iso646irv1991': 'ascii',
2985-
'iso646us': 'ascii',
2986-
'iso88591': 'windows-1252',
2987-
'iso885910': 'iso8859-10',
2988-
'iso8859101992': 'iso8859-10',
2989-
'iso885911987': 'windows-1252',
2990-
'iso885913': 'iso8859-13',
2991-
'iso885914': 'iso8859-14',
2992-
'iso8859141998': 'iso8859-14',
2993-
'iso885915': 'iso8859-15',
2994-
'iso885916': 'iso8859-16',
2995-
'iso8859162001': 'iso8859-16',
2996-
'iso88592': 'iso8859-2',
2997-
'iso885921987': 'iso8859-2',
2998-
'iso88593': 'iso8859-3',
2999-
'iso885931988': 'iso8859-3',
3000-
'iso88594': 'iso8859-4',
3001-
'iso885941988': 'iso8859-4',
3002-
'iso88595': 'iso8859-5',
3003-
'iso885951988': 'iso8859-5',
3004-
'iso88596': 'iso8859-6',
3005-
'iso885961987': 'iso8859-6',
3006-
'iso88597': 'iso8859-7',
3007-
'iso885971987': 'iso8859-7',
3008-
'iso88598': 'iso8859-8',
3009-
'iso885981988': 'iso8859-8',
3010-
'iso88599': 'windows-1254',
3011-
'iso885991989': 'windows-1254',
3012-
'isoceltic': 'iso8859-14',
3013-
'isoir100': 'windows-1252',
3014-
'isoir101': 'iso8859-2',
3015-
'isoir109': 'iso8859-3',
3016-
'isoir110': 'iso8859-4',
3017-
'isoir126': 'iso8859-7',
3018-
'isoir127': 'iso8859-6',
3019-
'isoir138': 'iso8859-8',
3020-
'isoir144': 'iso8859-5',
3021-
'isoir148': 'windows-1254',
3022-
'isoir149': 'cp949',
3023-
'isoir157': 'iso8859-10',
3024-
'isoir199': 'iso8859-14',
3025-
'isoir226': 'iso8859-16',
3026-
'isoir58': 'gbk',
3027-
'isoir6': 'ascii',
3028-
'koi8r': 'koi8-r',
3029-
'koi8u': 'koi8-u',
3030-
'korean': 'cp949',
3031-
'ksc5601': 'cp949',
3032-
'ksc56011987': 'cp949',
3033-
'ksc56011989': 'cp949',
3034-
'l1': 'windows-1252',
3035-
'l10': 'iso8859-16',
3036-
'l2': 'iso8859-2',
3037-
'l3': 'iso8859-3',
3038-
'l4': 'iso8859-4',
3039-
'l5': 'windows-1254',
3040-
'l6': 'iso8859-10',
3041-
'l8': 'iso8859-14',
3042-
'latin1': 'windows-1252',
3043-
'latin10': 'iso8859-16',
3044-
'latin2': 'iso8859-2',
3045-
'latin3': 'iso8859-3',
3046-
'latin4': 'iso8859-4',
3047-
'latin5': 'windows-1254',
3048-
'latin6': 'iso8859-10',
3049-
'latin8': 'iso8859-14',
3050-
'latin9': 'iso8859-15',
3051-
'ms936': 'gbk',
3052-
'mskanji': 'shift_jis',
3053-
'pt154': 'ptcp154',
3054-
'ptcp154': 'ptcp154',
3055-
'r8': 'hp-roman8',
3056-
'roman8': 'hp-roman8',
3057-
'shiftjis': 'shift_jis',
3058-
'tis620': 'cp874',
3059-
'unicode11utf7': 'utf-7',
3060-
'us': 'ascii',
3061-
'usascii': 'ascii',
3062-
'utf16': 'utf-16',
3063-
'utf16be': 'utf-16-be',
3064-
'utf16le': 'utf-16-le',
3065-
'utf8': 'utf-8',
3066-
'windows1250': 'cp1250',
3067-
'windows1251': 'cp1251',
3068-
'windows1252': 'cp1252',
3069-
'windows1253': 'cp1253',
3070-
'windows1254': 'cp1254',
3071-
'windows1255': 'cp1255',
3072-
'windows1256': 'cp1256',
3073-
'windows1257': 'cp1257',
3074-
'windows1258': 'cp1258',
3075-
'windows936': 'gbk',
3076-
'x-x-big5': 'big5'}
3077-
30782849
tokenTypes = {
30792850
"Doctype": 0,
30802851
"Characters": 1,

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy