Skip to content

[DomCrawler] Fixed filterXPath() chaining loosing the parent DOM nodes #10958

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
May 21, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 103 additions & 16 deletions src/Symfony/Component/DomCrawler/Crawler.php
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ public function addHtmlContent($content, $charset = 'UTF-8')

$this->addDocument($dom);

$base = $this->filterXPath('descendant-or-self::base')->extract(array('href'));
$base = $this->filterRelativeXPath('descendant-or-self::base')->extract(array('href'));

$baseHref = current($base);
if (count($base) && !empty($baseHref)) {
Expand Down Expand Up @@ -445,7 +445,7 @@ public function parents()
$nodes = array();

while ($node = $node->parentNode) {
if (1 === $node->nodeType && '_root' !== $node->nodeName) {
if (1 === $node->nodeType) {
$nodes[] = $node;
}
}
Expand Down Expand Up @@ -580,6 +580,11 @@ public function extract($attributes)
/**
* Filters the list of nodes with an XPath expression.
*
* The XPath expression is evaluated in the context of the crawler, which
* is considered as a fake parent of the elements inside it.
* This means that a child selector "div" or "./div" will match only
* the div elements of the current crawler, not their children.
*
* @param string $xpath An XPath expression
*
* @return Crawler A new instance of Crawler with the filtered list of nodes
Expand All @@ -588,15 +593,14 @@ public function extract($attributes)
*/
public function filterXPath($xpath)
{
$document = new \DOMDocument('1.0', 'UTF-8');
$root = $document->appendChild($document->createElement('_root'));
foreach ($this as $node) {
$root->appendChild($document->importNode($node, true));
}
$xpath = $this->relativize($xpath);

$domxpath = new \DOMXPath($document);
// If we dropped all expressions in the XPath while preparing it, there would be no match
if ('' === $xpath) {
return new static(null, $this->uri);
}

return new static($domxpath->query($xpath), $this->uri);
return $this->filterRelativeXPath($xpath);
}

/**
Expand All @@ -620,7 +624,8 @@ public function filter($selector)
// @codeCoverageIgnoreEnd
}

return $this->filterXPath(CssSelector::toXPath($selector));
// The CssSelector already prefixes the selector with descendant-or-self::
return $this->filterRelativeXPath(CssSelector::toXPath($selector));
}

/**
Expand All @@ -634,10 +639,10 @@ public function filter($selector)
*/
public function selectLink($value)
{
$xpath = sprintf('//a[contains(concat(\' \', normalize-space(string(.)), \' \'), %s)] ', static::xpathLiteral(' '.$value.' ')).
sprintf('| //a/img[contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)]/ancestor::a', static::xpathLiteral(' '.$value.' '));
$xpath = sprintf('descendant-or-self::a[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) ', static::xpathLiteral(' '.$value.' ')).
sprintf('or ./img[contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)]]', static::xpathLiteral(' '.$value.' '));

return $this->filterXPath($xpath);
return $this->filterRelativeXPath($xpath);
}

/**
Expand All @@ -652,11 +657,11 @@ public function selectLink($value)
public function selectButton($value)
{
$translate = 'translate(@type, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")';
$xpath = sprintf('//input[((contains(%s, "submit") or contains(%s, "button")) and contains(concat(\' \', normalize-space(string(@value)), \' \'), %s)) ', $translate, $translate, static::xpathLiteral(' '.$value.' ')).
$xpath = sprintf('descendant-or-self::input[((contains(%s, "submit") or contains(%s, "button")) and contains(concat(\' \', normalize-space(string(@value)), \' \'), %s)) ', $translate, $translate, static::xpathLiteral(' '.$value.' ')).
sprintf('or (contains(%s, "image") and contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)) or @id="%s" or @name="%s"] ', $translate, static::xpathLiteral(' '.$value.' '), $value, $value).
sprintf('| //button[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) or @id="%s" or @name="%s"]', static::xpathLiteral(' '.$value.' '), $value, $value);
sprintf('| descendant-or-self::button[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) or @id="%s" or @name="%s"]', static::xpathLiteral(' '.$value.' '), $value, $value);

return $this->filterXPath($xpath);
return $this->filterRelativeXPath($xpath);
}

/**
Expand Down Expand Up @@ -772,6 +777,88 @@ public static function xpathLiteral($s)
return sprintf("concat(%s)", implode($parts, ', '));
}

/**
* Filters the list of nodes with an XPath expression.
*
* The XPath expression should already be processed to apply it in the context of each node.
*
* @param string $xpath
*
* @return Crawler
*/
private function filterRelativeXPath($xpath)
{
$crawler = new static(null, $this->uri);

foreach ($this as $node) {
$domxpath = new \DOMXPath($node->ownerDocument);
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@fabpot this method will need to be adapted when merging 2.3 into 2.4 because of the XML namespace support. See https://github.com/symfony/symfony/pull/10935/files#diff-b21c94d886a3648c0f8a59ef30df1e8aR824 for the way the code should look like in 2.4 and master

$crawler->add($domxpath->query($xpath, $node));
}

return $crawler;
}

/**
* Make the XPath relative to the current context.
*
* The returned XPath will match elements matching the XPath inside the current crawler
* when running in the context of a node of the crawler.
*
* @param string $xpath
*
* @return string
*/
private function relativize($xpath)
{
$expressions = array();

$unionPattern = '/\|(?![^\[]*\])/';
// An expression which will never match to replace expressions which cannot match in the crawler
// We cannot simply drop
$nonMatchingExpression = 'a[name() = "b"]';

// Split any unions into individual expressions.
foreach (preg_split($unionPattern, $xpath) as $expression) {
$expression = trim($expression);
$parenthesis = '';

// If the union is inside some braces, we need to preserve the opening braces and apply
// the change only inside it.
if (preg_match('/^[\(\s*]+/', $expression, $matches)) {
$parenthesis = $matches[0];
$expression = substr($expression, strlen($parenthesis));
}

// BC for Symfony 2.4 and lower were elements were adding in a fake _root parent
if (0 === strpos($expression, '/_root/')) {
$expression = './'.substr($expression, 7);
}

// add prefix before absolute element selector
if (empty($expression)) {
$expression = $nonMatchingExpression;
} elseif (0 === strpos($expression, '//')) {
$expression = 'descendant-or-self::' . substr($expression, 2);
} elseif (0 === strpos($expression, './')) {
$expression = 'self::' . substr($expression, 2);
} elseif ('/' === $expression[0]) {
// the only direct child in Symfony 2.4 and lower is _root, which is already handled previously
// so let's drop the expression entirely
$expression = $nonMatchingExpression;
} elseif ('.' === $expression[0]) {
// '.' is the fake root element in Symfony 2.4 and lower, which is excluded from results
$expression = $nonMatchingExpression;
} elseif (0 === strpos($expression, 'descendant::')) {
$expression = 'descendant-or-self::' . substr($expression, strlen('descendant::'));
} elseif (0 !== strpos($expression, 'descendant-or-self::')) {
$expression = 'self::' .$expression;
}
$expressions[] = $parenthesis.$expression;
}

return implode(' | ', $expressions);
}

/**
* @param int $position
*
Expand Down
8 changes: 1 addition & 7 deletions src/Symfony/Component/DomCrawler/Field/FormField.php
Original file line number Diff line number Diff line change
Expand Up @@ -52,13 +52,7 @@ public function __construct(\DOMNode $node)
{
$this->node = $node;
$this->name = $node->getAttribute('name');

$this->document = new \DOMDocument('1.0', 'UTF-8');
$this->node = $this->document->importNode($this->node, true);

$root = $this->document->appendChild($this->document->createElement('_root'));
$root->appendChild($this->node);
$this->xpath = new \DOMXPath($this->document);
$this->xpath = new \DOMXPath($node->ownerDocument);

$this->initialize();
}
Expand Down
26 changes: 9 additions & 17 deletions src/Symfony/Component/DomCrawler/Form.php
Original file line number Diff line number Diff line change
Expand Up @@ -388,9 +388,7 @@ private function initialize()
{
$this->fields = new FormFieldRegistry();

$document = new \DOMDocument('1.0', 'UTF-8');
$xpath = new \DOMXPath($document);
$root = $document->appendChild($document->createElement('_root'));
$xpath = new \DOMXPath($this->node->ownerDocument);

// add submitted button if it has a valid name
if ('form' !== $this->button->nodeName && $this->button->hasAttribute('name') && $this->button->getAttribute('name')) {
Expand All @@ -400,38 +398,32 @@ private function initialize()

// temporarily change the name of the input node for the x coordinate
$this->button->setAttribute('name', $name.'.x');
$this->set(new Field\InputFormField($document->importNode($this->button, true)));
$this->set(new Field\InputFormField($this->button));

// temporarily change the name of the input node for the y coordinate
$this->button->setAttribute('name', $name.'.y');
$this->set(new Field\InputFormField($document->importNode($this->button, true)));
$this->set(new Field\InputFormField($this->button));

// restore the original name of the input node
$this->button->setAttribute('name', $name);
} else {
$this->set(new Field\InputFormField($document->importNode($this->button, true)));
$this->set(new Field\InputFormField($this->button));
}
}

// find form elements corresponding to the current form
if ($this->node->hasAttribute('id')) {
// traverse through the whole document
$node = $document->importNode($this->node->ownerDocument->documentElement, true);
$root->appendChild($node);

// corresponding elements are either descendants or have a matching HTML5 form attribute
$formId = Crawler::xpathLiteral($this->node->getAttribute('id'));
$fieldNodes = $xpath->query(sprintf('descendant::input[@form=%s] | descendant::button[@form=%s] | descendant::textarea[@form=%s] | descendant::select[@form=%s] | //form[@id=%s]//input[not(@form)] | //form[@id=%s]//button[not(@form)] | //form[@id=%s]//textarea[not(@form)] | //form[@id=%s]//select[not(@form)]', $formId, $formId, $formId, $formId, $formId, $formId, $formId, $formId), $root);

$fieldNodes = $xpath->query(sprintf('descendant::input[@form=%s] | descendant::button[@form=%s] | descendant::textarea[@form=%s] | descendant::select[@form=%s] | //form[@id=%s]//input[not(@form)] | //form[@id=%s]//button[not(@form)] | //form[@id=%s]//textarea[not(@form)] | //form[@id=%s]//select[not(@form)]', $formId, $formId, $formId, $formId, $formId, $formId, $formId, $formId));
foreach ($fieldNodes as $node) {
$this->addField($node);
}
} else {
// parent form has no id, add descendant elements only
$node = $document->importNode($this->node, true);
$root->appendChild($node);

// descendant elements with form attribute are not part of this form
$fieldNodes = $xpath->query('descendant::input[not(@form)] | descendant::button[not(@form)] | descendant::textarea[not(@form)] | descendant::select[not(@form)]', $root);
// do the xpath query with $this->node as the context node, to only find descendant elements
// however, descendant elements with form attribute are not part of this form
$fieldNodes = $xpath->query('descendant::input[not(@form)] | descendant::button[not(@form)] | descendant::textarea[not(@form)] | descendant::select[not(@form)]', $this->node);
foreach ($fieldNodes as $node) {
$this->addField($node);
}
Expand Down
67 changes: 66 additions & 1 deletion src/Symfony/Component/DomCrawler/Tests/CrawlerTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,27 @@ public function testExtract()
$this->assertEquals(array(), $this->createTestCrawler()->filterXPath('//ol')->extract('_text'), '->extract() returns an empty array if the node list is empty');
}

public function testFilterXpathComplexQueries()
{
$crawler = $this->createTestCrawler()->filterXPath('//body');

$this->assertCount(0, $crawler->filterXPath('/input'));
$this->assertCount(0, $crawler->filterXPath('/body'));
$this->assertCount(1, $crawler->filterXPath('/_root/body'));
$this->assertCount(1, $crawler->filterXPath('./body'));
$this->assertCount(4, $crawler->filterXPath('//form')->filterXPath('//button | //input'));
$this->assertCount(1, $crawler->filterXPath('body'));
$this->assertCount(6, $crawler->filterXPath('//button | //input'));
$this->assertCount(1, $crawler->filterXPath('//body'));
$this->assertCount(1, $crawler->filterXPath('descendant-or-self::body'));
$this->assertCount(1, $crawler->filterXPath('//div[@id="parent"]')->filterXPath('./div'), 'A child selection finds only the current div');
$this->assertCount(2, $crawler->filterXPath('//div[@id="parent"]')->filterXPath('descendant::div'), 'A descendant selector matches the current div and its child');
$this->assertCount(2, $crawler->filterXPath('//div[@id="parent"]')->filterXPath('//div'), 'A descendant selector matches the current div and its child');
$this->assertCount(5, $crawler->filterXPath('(//a | //div)//img'));
$this->assertCount(7, $crawler->filterXPath('((//a | //div)//img | //ul)'));
$this->assertCount(7, $crawler->filterXPath('( ( //a | //div )//img | //ul )'));
}

/**
* @covers Symfony\Component\DomCrawler\Crawler::filterXPath
*/
Expand All @@ -378,8 +399,10 @@ public function testFilterXPath()
$this->assertInstanceOf('Symfony\\Component\\DomCrawler\\Crawler', $crawler, '->filterXPath() returns a new instance of a crawler');

$crawler = $this->createTestCrawler()->filterXPath('//ul');

$this->assertCount(6, $crawler->filterXPath('//li'), '->filterXPath() filters the node list with the XPath expression');

$crawler = $this->createTestCrawler();
$this->assertCount(3, $crawler->filterXPath('//body')->filterXPath('//button')->parents(), '->filterXpath() preserves parents when chained');
}

/**
Expand Down Expand Up @@ -455,6 +478,44 @@ public function testLink()
}
}

public function testSelectLinkAndLinkFiltered()
{
$html = <<<HTML
<!DOCTYPE html>
<html lang="en">
<body>
<div id="action">
<a href="/index.php?r=site/login">Login</a>
</div>
<form id="login-form" action="/index.php?r=site/login" method="post">
<button type="submit">Submit</button>
</form>
</body>
</html>
HTML;

$crawler = new Crawler($html);
$filtered = $crawler->filterXPath("descendant-or-self::*[@id = 'login-form']");

$this->assertCount(0, $filtered->selectLink('Login'));
$this->assertCount(1, $filtered->selectButton('Submit'));

$filtered = $crawler->filterXPath("descendant-or-self::*[@id = 'action']");

$this->assertCount(1, $filtered->selectLink('Login'));
$this->assertCount(0, $filtered->selectButton('Submit'));

$this->assertCount(1, $crawler->selectLink('Login')->selectLink('Login'));
$this->assertCount(1, $crawler->selectButton('Submit')->selectButton('Submit'));
}

public function testChaining()
{
$crawler = new Crawler('<div name="a"><div name="b"><div name="c"></div></div></div>');

$this->assertEquals('a', $crawler->filterXPath('//div')->filterXPath('div')->filterXPath('div')->attr('name'));
}

public function testLinks()
{
$crawler = $this->createTestCrawler('http://example.com/bar/')->selectLink('Foo');
Expand Down Expand Up @@ -665,6 +726,10 @@ public function createTestCrawler($uri = null)
<li>Two Bis</li>
<li>Three Bis</li>
</ul>
<div id="parent">
<div id="child"></div>
</div>
<div id="sibling"><img /></div>
</body>
</html>
');
Expand Down
Loading
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy