Skip to content

Commit 8c43a81

Browse files
committed
Performance improvement on parsing larger HTML pages.
1 parent a025d87 commit 8c43a81

File tree

5 files changed

+166
-140
lines changed

5 files changed

+166
-140
lines changed

CHANGES

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@ jsoup changelog
66

77
* When fetching XML URLs, automatically switch to the XML parser instead of the HTML parser.
88
<https://github.com/jhy/jsoup/pull/574>
9+
10+
* Performance improvement on parsing larger HTML pages. On Android KitKat, around 1.7x faster. On Android
11+
Lollipop, ~ 1.3x faster. Improvements largely from re-ordering the HtmlTreeBuilder methods based on analysis of
12+
various websites; also from further memory reduction for nodes with no children, and other tweaks.
913

1014
* Fixed an issue in Element.getElementSiblingIndex (and related methods) where sibling elements with the same content
1115
would incorrectly have the same sibling index.

src/main/java/org/jsoup/helper/StringUtil.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import java.net.MalformedURLException;
44
import java.net.URL;
5+
import java.util.Arrays;
56
import java.util.Collection;
67
import java.util.Iterator;
78

@@ -153,6 +154,10 @@ public static boolean in(String needle, String... haystack) {
153154
return false;
154155
}
155156

157+
public static boolean inSorted(String needle, String[] haystack) {
158+
return Arrays.binarySearch(haystack, needle) >= 0;
159+
}
160+
156161
/**
157162
* Create a new absolute URL, from a provided existing absolute URL and a relative URL component.
158163
* @param base the existing absolute base URL

src/main/java/org/jsoup/nodes/Element.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,7 @@ public Element appendChild(Node child) {
284284

285285
// was - Node#addChildren(child). short-circuits an array create and a loop.
286286
reparentChild(child);
287+
ensureChildNodes();
287288
childNodes.add(child);
288289
child.setSiblingIndex(childNodes.size() - 1);
289290
return this;

src/main/java/org/jsoup/nodes/Node.java

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
1919
@author Jonathan Hedley, jonathan@hedley.net */
2020
public abstract class Node implements Cloneable {
21+
private static final List<Node> EMPTY_NODES = Collections.emptyList();
2122
Node parentNode;
2223
List<Node> childNodes;
2324
Attributes attributes;
@@ -33,7 +34,7 @@ protected Node(String baseUri, Attributes attributes) {
3334
Validate.notNull(baseUri);
3435
Validate.notNull(attributes);
3536

36-
childNodes = new ArrayList<Node>(4);
37+
childNodes = EMPTY_NODES;
3738
this.baseUri = baseUri.trim();
3839
this.attributes = attributes;
3940
}
@@ -46,7 +47,7 @@ protected Node(String baseUri) {
4647
* Default constructor. Doesn't set up base uri, children, or attributes; use with caution.
4748
*/
4849
protected Node() {
49-
childNodes = Collections.emptyList();
50+
childNodes = EMPTY_NODES;
5051
attributes = null;
5152
}
5253

@@ -428,6 +429,7 @@ protected void addChildren(Node... children) {
428429
//most used. short circuit addChildren(int), which hits reindex children and array copy
429430
for (Node child: children) {
430431
reparentChild(child);
432+
ensureChildNodes();
431433
childNodes.add(child);
432434
child.setSiblingIndex(childNodes.size()-1);
433435
}
@@ -438,11 +440,18 @@ protected void addChildren(int index, Node... children) {
438440
for (int i = children.length - 1; i >= 0; i--) {
439441
Node in = children[i];
440442
reparentChild(in);
443+
ensureChildNodes();
441444
childNodes.add(index, in);
442445
}
443446
reindexChildren(index);
444447
}
445448

449+
protected void ensureChildNodes() {
450+
if (childNodes == EMPTY_NODES) {
451+
childNodes = new ArrayList<Node>(4);
452+
}
453+
}
454+
446455
protected void reparentChild(Node child) {
447456
if (child.parentNode != null)
448457
child.parentNode.removeChild(child);

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy