Skip to content

Commit e6514a0

Browse files
committed
Merge branch 'pr/574'
2 parents 2736346 + 5236f0b commit e6514a0

File tree

5 files changed

+45
-7
lines changed

5 files changed

+45
-7
lines changed

CHANGES

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@ jsoup changelog
33
*** Release 1.8.3 [PENDING]
44
* Added support for custom boolean attributes.
55
<https://github.com/jhy/jsoup/pull/555>
6+
7+
* When fetching XML URLs, automatically switch to the XML parser instead of the HTML parser.
8+
<https://github.com/jhy/jsoup/pull/574>
69

710
* Fixed an issue in Element.getElementSiblingIndex (and related methods) where sibling elements with the same content
811
would incorrectly have the same sibling index.

src/main/java/org/jsoup/Connection.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package org.jsoup;
22

33
import org.jsoup.nodes.Document;
4+
import org.jsoup.parser.HtmlTreeBuilder;
45
import org.jsoup.parser.Parser;
56

67
import java.io.IOException;
@@ -210,7 +211,8 @@ public final boolean hasBody() {
210211
Connection cookies(Map<String, String> cookies);
211212

212213
/**
213-
* Provide an alternate parser to use when parsing the response to a Document.
214+
* Provide an alternate parser to use when parsing the response to a Document. If not set, defaults to the HTML
215+
* parser, unless the response content-type is XML, in which case the XML parser is used.
214216
* @param parser alternate parser
215217
* @return this Connection, for chaining
216218
*/

src/main/java/org/jsoup/helper/HttpConnection.java

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -357,6 +357,7 @@ public static class Request extends HttpConnection.Base<Connection.Request> impl
357357
private boolean ignoreHttpErrors = false;
358358
private boolean ignoreContentType = false;
359359
private Parser parser;
360+
private boolean parserDefined = false; // called parser(...) vs initialized in ctor
360361
private boolean validateTSLCertificates = true;
361362
private String postDataCharset = DataUtil.defaultCharset;
362363

@@ -437,6 +438,7 @@ public Collection<Connection.KeyVal> data() {
437438

438439
public Request parser(Parser parser) {
439440
this.parser = parser;
441+
parserDefined = true;
440442
return this;
441443
}
442444

@@ -470,11 +472,9 @@ public static class Response extends HttpConnection.Base<Connection.Response> im
470472
private Connection.Request req;
471473

472474
/*
473-
* For example {@code application/atom+xml;charset=utf-8}.
474-
* Stepping through it: start with {@code "application/"}, follow with word
475-
* characters up to a {@code "+xml"}, and then maybe more ({@code .*}).
475+
* Matches XML content types (like text/xml, application/xhtml+xml;charset=UTF8, etc)
476476
*/
477-
private static final Pattern xmlContentTypeRxp = Pattern.compile("application/\\w+\\+xml.*");
477+
private static final Pattern xmlContentTypeRxp = Pattern.compile("(application|text)/\\w*\\+?xml.*");
478478

479479
Response() {
480480
super();
@@ -541,12 +541,19 @@ static Response execute(Connection.Request req, Response previousResponse) throw
541541
if (contentType != null
542542
&& !req.ignoreContentType()
543543
&& !contentType.startsWith("text/")
544-
&& !contentType.startsWith("application/xml")
545544
&& !xmlContentTypeRxp.matcher(contentType).matches()
546545
)
547546
throw new UnsupportedMimeTypeException("Unhandled content type. Must be text/*, application/xml, or application/xhtml+xml",
548547
contentType, req.url().toString());
549548

549+
// switch to the XML parser if content type is xml and the parser was not explicitly set
550+
if (contentType != null && xmlContentTypeRxp.matcher(contentType).matches()) {
551+
// only flip it if req is an HttpConnection.Request (i.e. don't presume other impls want it):
552+
if (req instanceof HttpConnection.Request && !((Request) req).parserDefined) {
553+
req.parser(Parser.xmlParser());
554+
}
555+
}
556+
550557
res.charset = DataUtil.getCharsetFromContentType(res.contentType); // may be null, readInputStream deals with it
551558
if (conn.getContentLength() != 0) { // -1 means unknown, chunked. sun throws an IO exception on 500 response with no content when trying to read body
552559
InputStream bodyStream = null;

src/main/java/org/jsoup/parser/HtmlTreeBuilder.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
/**
1212
* HTML Tree Builder; creates a DOM from Tokens.
1313
*/
14-
class HtmlTreeBuilder extends TreeBuilder {
14+
public class HtmlTreeBuilder extends TreeBuilder {
1515
// tag searches
1616
private static final String[] TagsScriptStyle = new String[]{"script", "style"};
1717
public static final String[] TagsSearchInScope = new String[]{"applet", "caption", "html", "table", "td", "th", "marquee", "object"};

src/test/java/org/jsoup/integration/UrlConnectTest.java

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,13 @@
44
import org.jsoup.HttpStatusException;
55
import org.jsoup.Jsoup;
66
import org.jsoup.UnsupportedMimeTypeException;
7+
import org.jsoup.helper.StringUtil;
78
import org.jsoup.helper.W3CDom;
89
import org.jsoup.nodes.Document;
910
import org.jsoup.nodes.FormElement;
11+
import org.jsoup.parser.HtmlTreeBuilder;
12+
import org.jsoup.parser.Parser;
13+
import org.jsoup.parser.XmlTreeBuilder;
1014
import org.junit.Ignore;
1115
import org.junit.Test;
1216

@@ -494,4 +498,26 @@ public void fetchToW3c() throws IOException {
494498
assertTrue(html.contains("jsoup"));
495499
}
496500

501+
@Test
502+
public void fetchHandlesXml() throws IOException {
503+
// should auto-detect xml and use the XML parser, unless the html parser was explicitly requested
504+
String xmlUrl = "http://direct.infohound.net/tools/parse-xml.xml";
505+
Connection con = Jsoup.connect(xmlUrl);
506+
Document doc = con.get();
507+
Connection.Request req = con.request();
508+
assertTrue(req.parser().getTreeBuilder() instanceof XmlTreeBuilder);
509+
assertEquals("<xml> <link> one </link> <table> Two </table> </xml>", StringUtil.normaliseWhitespace(doc.outerHtml()));
510+
}
511+
512+
@Test
513+
public void fetchHandlesXmlAsHtmlWhenParserSet() throws IOException {
514+
// when the html parser is explicitly requested, it should be used even though the response is xml
515+
String xmlUrl = "http://direct.infohound.net/tools/parse-xml.xml";
516+
Connection con = Jsoup.connect(xmlUrl).parser(Parser.htmlParser());
517+
Document doc = con.get();
518+
Connection.Request req = con.request();
519+
assertTrue(req.parser().getTreeBuilder() instanceof HtmlTreeBuilder);
520+
assertEquals("<html> <head></head> <body> <xml> <link>one <table> Two </table> </xml> </body> </html>", StringUtil.normaliseWhitespace(doc.outerHtml()));
521+
}
522+
497523
}

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy