Skip to content

Commit cf88221

Browse files
committed
Added ::cdata node selector
Supplements #2324
1 parent 893706a commit cf88221

File tree

4 files changed

+40
-5
lines changed

4 files changed

+40
-5
lines changed

CHANGES.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
* If you happen to be using any of the deprecated methods, please take the opportunity now to migrate away from them, as they will be removed in a future release.
1212

1313
### Improvements
14-
* Enhanced the `Selector` to support direct matching against nodes such as comments and text nodes. For example, you can now find an element that follows a specific comment: `::comment:contains(prices) + p` will select `p` elements immediately after a `<!-- prices: -->` comment. Supported types include `::node`, `::leafnode`, `::comment`, `::text`, and `::data`. Node contextual selectors like `::node:contains(text)`, `:matches(regex)`, and `:blank` are also supported. Introduced `Element#selectNodes(String css)` and `Element#selectNodes(String css, Class nodeType)` for direct node selection. [#2324](https://github.com/jhy/jsoup/pull/2324)
14+
* Enhanced the `Selector` to support direct matching against nodes such as comments and text nodes. For example, you can now find an element that follows a specific comment: `::comment:contains(prices) + p` will select `p` elements immediately after a `<!-- prices: -->` comment. Supported types include `::node`, `::leafnode`, `::comment`, `::text`, `::data`, and `::cdata`. Node contextual selectors like `::node:contains(text)`, `:matches(regex)`, and `:blank` are also supported. Introduced `Element#selectNodes(String css)` and `Element#selectNodes(String css, Class nodeType)` for direct node selection. [#2324](https://github.com/jhy/jsoup/pull/2324)
1515
* Added `TagSet#onNewTag(Consumer<Tag> customizer)`: register a callback that’s invoked for each new or cloned Tag when it’s inserted into the set. Enables dynamic tweaks of tag options (for example, marking all custom tags as self-closing, or everything in a given namespace as preserving whitespace).
1616
* Made `TokenQueue` and `CharacterReader` autocloseable, to ensure that they will release their buffers back to the buffer pool, for later reuse.
1717
* Added `Selector#evaluatorOf(String css)`, as a clearer way to obtain an Evaluator from a CSS query. An alias of `QueryParser.parse(String css)`.

src/main/java/org/jsoup/select/QueryParser.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import org.jsoup.internal.StringUtil;
44
import org.jsoup.helper.Validate;
5+
import org.jsoup.nodes.CDataNode;
56
import org.jsoup.nodes.Comment;
67
import org.jsoup.nodes.DataNode;
78
import org.jsoup.nodes.LeafNode;
@@ -281,6 +282,9 @@ private Evaluator parseNodeSelector() {
281282
case "data":
282283
left = new NodeEvaluator.InstanceType(DataNode.class, pseudo);
283284
break;
285+
case "cdata":
286+
left = new NodeEvaluator.InstanceType(CDataNode.class, pseudo);
287+
break;
284288
default:
285289
throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query,
286290
tq.remainder());

src/main/java/org/jsoup/select/Selector.java

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -94,10 +94,11 @@
9494
<tr><td>::leafnode</td><td>Matches any leaf-node (that is, a Node which is not an Element)</td><td></td></tr>
9595
<tr><td>::comment</td><td>Matches a Comment node</td><td></td></tr>
9696
<tr><td>::text</td><td>Matches a TextNode</td><td></td></tr>
97-
<tr><td>::data</td><td>Matches a DataNode</td><td></td></tr>
98-
<tr><td>::node:contains(text)</td><td>Matches a node that has a (normalized, case-insensitive) value containing <i>text</i>.</td><td>::comment:contains(foo bar)</td></tr>
99-
<tr><td>::node:matches(regex)</td><td>Matches a node that has a value matching the regex.</td><td>::comment:matches(\\d+)</td></tr>
100-
<tr><td>::node:blank</td><td>Matches a node that has either no value, or a value of only whitespace.</td><td>::comment:not(:blank)</td></tr>
97+
<tr><td>::data</td><td>Matches a DataNode (e.g. the content of a <code>script</code> or a <code>style</code> element)</td><td></td></tr>
98+
<tr><td>::cdata</td><td>Matches a CDataNode (which is only present in XML)</td><td></td></tr>
99+
<tr><td>::node:contains(text)</td><td>Matches a node that has a (normalized, case-insensitive) value containing <i>text</i>.</td><td><code>::comment:contains(foo bar)</code></td></tr>
100+
<tr><td>::node:matches(regex)</td><td>Matches a node that has a value matching the regex.</td><td><code>::comment:matches(\\d+)</code></td></tr>
101+
<tr><td>::node:blank</td><td>Matches a node that has either no value, or a value of only whitespace.</td><td><code>::comment:not(:blank)</code></td></tr>
101102
</table>
102103
103104
<p>A word on using regular expressions in these selectors: depending on the content of the regex, you will need to quote the pattern using <b><code>Pattern.quote("regex")</code></b> for it to parse correctly through both the selector parser and the regex parser. E.g. <code>String query = "div:matches(" + Pattern.quote(regex) + ")";</code>.</p>

src/test/java/org/jsoup/select/SelectorTest.java

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import org.jsoup.Jsoup;
44
import org.jsoup.MultiLocaleExtension.MultiLocaleTest;
5+
import org.jsoup.nodes.CDataNode;
56
import org.jsoup.nodes.Comment;
67
import org.jsoup.nodes.Document;
78
import org.jsoup.nodes.Element;
@@ -1688,4 +1689,33 @@ public void testAncestorChain() {
16881689
assertEquals("432", nodes.get(3).nodeValue());
16891690
}
16901691

1692+
@Test void cdataNodes() {
1693+
String xml = "<body><![CDATA[One]]><p>Two</p><![CDATA[Three]]><x><![CDATA[ ]]></body>";
1694+
Document doc = Jsoup.parse(xml, Parser.xmlParser());
1695+
1696+
// via leafnode:
1697+
Nodes<CDataNode> leafnodes = doc.selectNodes("::leafnode", CDataNode.class);
1698+
assertEquals(3, leafnodes.size());
1699+
1700+
// cdata via unfiltered
1701+
Nodes<Node> nodes = doc.selectNodes("::cdata");
1702+
assertEquals(3, nodes.size());
1703+
1704+
// (not) blank:
1705+
Nodes<CDataNode> notBlanks = doc.selectNodes("::cdata:not(:blank)", CDataNode.class);
1706+
assertEquals(2, notBlanks.size());
1707+
assertEquals("One", notBlanks.get(0).nodeValue());
1708+
assertEquals("Three", notBlanks.get(1).nodeValue());
1709+
1710+
// contains:
1711+
Nodes<CDataNode> contains = doc.selectNodes("::cdata:contains(One)", CDataNode.class);
1712+
assertEquals(1, contains.size());
1713+
assertEquals("One", contains.get(0).nodeValue());
1714+
1715+
// matches:
1716+
Nodes<CDataNode> matches = doc.selectNodes("::cdata:matches(re)", CDataNode.class);
1717+
assertEquals(1, matches.size());
1718+
assertEquals("Three", matches.get(0).nodeValue());
1719+
}
1720+
16911721
}

0 commit comments

Comments
 (0)