diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 00000000..b18fd293 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,6 @@ +version: 2 +updates: + - package-ecosystem: 'github-actions' + directory: '/' + schedule: + interval: 'weekly' diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml new file mode 100644 index 00000000..52349b44 --- /dev/null +++ b/.github/workflows/benchmark.yml @@ -0,0 +1,29 @@ +name: Benchmark + +on: + - push + - pull_request + +jobs: + benchmark: + name: "Benchmark: Ruby ${{ matrix.ruby-version }}: ${{ matrix.runs-on }}" + strategy: + fail-fast: false + matrix: + ruby-version: + - '3.3' + runs-on: + - ubuntu-latest + runs-on: ${{ matrix.runs-on }} + steps: + - uses: actions/checkout@v4 + - uses: ruby/setup-ruby@v1 + with: + ruby-version: ${{ matrix.ruby-version }} + - name: Install dependencies + run: | + bundle install + gem install rexml -v 3.2.6 + - name: Benchmark + run: | + rake benchmark diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 00000000..20ff87e7 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,30 @@ +name: Release +on: + push: + tags: + - "*" +jobs: + github: + name: GitHub + runs-on: ubuntu-latest + timeout-minutes: 10 + steps: + - uses: actions/checkout@v4 + - name: Extract release note + run: | + ruby \ + -e 'print("## REXML "); \ + puts(ARGF.read.split(/^## /)[1]. \ + gsub(/ {.+?}/, ""). \ + gsub(/\[(.+?)\]\[.+?\]/) {$1})' \ + NEWS.md > release-note.md + - name: Upload to release + run: | + title=$(head -n1 release-note.md | sed -e 's/^## //') + tail -n +2 release-note.md > release-note-without-version.md + gh release create ${GITHUB_REF_NAME} \ + --discussion-category Announcements \ + --notes-file release-note-without-version.md \ + --title "${title}" + env: + GH_TOKEN: ${{ github.token }} diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 65a3bffd..0bd43457 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -3,7 +3,14 @@ on: - push - pull_request jobs: + ruby-versions-inplace: + uses: ruby/actions/.github/workflows/ruby_versions.yml@master + with: + engine: cruby-jruby + min_version: 2.5 + inplace: + needs: ruby-versions-inplace name: "Inplace: ${{ matrix.ruby-version }} on ${{ matrix.runs-on }}" runs-on: ${{ matrix.runs-on }} strategy: @@ -13,16 +20,14 @@ jobs: - ubuntu-latest - macos-latest - windows-latest - ruby-version: - - "2.5" - - "2.6" - - "2.7" - - jruby + ruby-version: ${{ fromJson(needs.ruby-versions-inplace.outputs.versions) }} + exclude: + - {runs-on: macos-latest, ruby-version: 2.5} # include: # - runs-on: ubuntu-latest # ruby-version: truffleruby steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - uses: ruby/setup-ruby@v1 with: ruby-version: ${{ matrix.ruby-version }} @@ -30,7 +35,26 @@ jobs: - name: Test run: bundle exec rake test + frozen-string-literal: + name: frozen-string-literal + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: ruby/setup-ruby@v1 + with: + ruby-version: ruby + bundler-cache: true + - name: Test + run: bundle exec rake test RUBYOPT="--enable-frozen-string-literal" + + ruby-versions-gems: + uses: ruby/actions/.github/workflows/ruby_versions.yml@master + with: + engine: cruby-jruby + min_version: 2.6 # REXML is a default gem since Ruby 2.6 + gem: + needs: ruby-versions-gems name: "Gem: ${{ matrix.ruby-version }} on ${{ matrix.runs-on }}" runs-on: ${{ matrix.runs-on }} strategy: @@ -40,17 +64,26 @@ jobs: - ubuntu-latest - macos-latest - windows-latest - ruby-version: - - "3.0" - - head + ruby-version: ${{ fromJson(needs.ruby-versions-gems.outputs.versions) }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - uses: ruby/setup-ruby@v1 with: ruby-version: ${{ matrix.ruby-version }} - name: Install as gem run: | rake install + - name: Install test dependencies on non-Windows + if: matrix.runs-on != 'windows-latest' + run: | + for gem in $(ruby -e 'puts ARGF.read[/^group :test do(.*)^end/m, 1].scan(/"(.+?)"/)' Gemfile); do + gem install ${gem} + done + - name: Install test dependencies on Windows + if: matrix.runs-on == 'windows-latest' + run: | + gem install test-unit + gem install test-unit-ruby-core - name: Test run: | ruby -run -e mkdir -- tmp @@ -62,17 +95,17 @@ jobs: name: "Document" runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - uses: ruby/setup-ruby@v1 with: - ruby-version: 2.7 + ruby-version: ruby - name: Install dependencies run: | bundle install - name: Build document run: | bundle exec rake warning:error rdoc - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 if: | github.event_name == 'push' with: diff --git a/Gemfile b/Gemfile index 54da2c0c..67f21dfb 100644 --- a/Gemfile +++ b/Gemfile @@ -4,3 +4,17 @@ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" } # Specify your gem's dependencies in rexml.gemspec gemspec + +group :development do + gem "bundler" + gem "rake" +end + +group :benchmark do + gem "benchmark_driver" +end + +group :test do + gem "test-unit" + gem "test-unit-ruby-core" +end diff --git a/NEWS.md b/NEWS.md index 84bbde2d..72318b7f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,15 +1,349 @@ # News +## 3.3.3 - 2024-08-01 {#version-3-3-3} + +### Improvements + + * Added support for detecting invalid XML that has unsupported + content before root element + * GH-184 + * Patch by NAITOH Jun. + + * Added support for `REXML::Security.entity_expansion_limit=` and + `REXML::Security.entity_expansion_text_limit=` in SAX2 and pull + parsers + * GH-187 + * Patch by NAITOH Jun. + + * Added more tests for invalid XMLs. + * GH-183 + * Patch by Watson. + + * Added more performance tests. + * Patch by Watson. + + * Improved parse performance. + * GH-186 + * Patch by tomoya ishida. + +### Thanks + + * NAITOH Jun + + * Watson + + * tomoya ishida + +## 3.3.2 - 2024-07-16 {#version-3-3-2} + +### Improvements + + * Improved parse performance. + * GH-160 + * Patch by NAITOH Jun. + + * Improved parse performance. + * GH-169 + * GH-170 + * GH-171 + * GH-172 + * GH-173 + * GH-174 + * GH-175 + * GH-176 + * GH-177 + * Patch by Watson. + + * Added support for raising a parse exception when an XML has extra + content after the root element. + * GH-161 + * Patch by NAITOH Jun. + + * Added support for raising a parse exception when an XML + declaration exists in wrong position. + * GH-162 + * Patch by NAITOH Jun. + + * Removed needless a space after XML declaration in pretty print mode. + * GH-164 + * Patch by NAITOH Jun. + + * Stopped to emit `:text` event after the root element. + * GH-167 + * Patch by NAITOH Jun. + +### Fixes + + * Fixed a bug that SAX2 parser doesn't expand predefined entities for + `characters` callback. + * GH-168 + * Patch by NAITOH Jun. + +### Thanks + + * NAITOH Jun + + * Watson + +## 3.3.1 - 2024-06-25 {#version-3-3-1} + +### Improvements + + * Added support for detecting malformed top-level comments. + * GH-145 + * Patch by Hiroya Fujinami. + + * Improved `REXML::Element#attribute` performance. + * GH-146 + * Patch by Hiroya Fujinami. + + * Added support for detecting malformed `` comments. + * GH-147 + * Patch by Hiroya Fujinami. + + * Added support for detecting unclosed `DOCTYPE`. + * GH-152 + * Patch by Hiroya Fujinami. + + * Added `changlog_uri` metadata to gemspec. + * GH-156 + * Patch by fynsta. + + * Improved parse performance. + * GH-157 + * GH-158 + * Patch by NAITOH Jun. + +### Fixes + + * Fixed a bug that large XML can't be parsed. + * GH-154 + * Patch by NAITOH Jun. + + * Fixed a bug that private constants are visible. + * GH-155 + * Patch by NAITOH Jun. + +### Thanks + + * Hiroya Fujinami + + * NAITOH Jun + + * fynsta + +## 3.3.0 - 2024-06-11 {#version-3-3-0} + +### Improvements + + * Added support for strscan 0.7.0 installed with Ruby 2.6. + * GH-142 + * Reported by Fernando Trigoso. + +### Thanks + + * Fernando Trigoso + +## 3.2.9 - 2024-06-09 {#version-3-2-9} + +### Improvements + + * Added support for old strscan. + * GH-132 + * Reported by Adam. + + * Improved attribute value parse performance. + * GH-135 + * Patch by NAITOH Jun. + + * Improved `REXML::Node#each_recursive` performance. + * GH-134 + * GH-139 + * Patch by Hiroya Fujinami. + + * Improved text parse performance. + * Reported by mprogrammer. + +### Thanks + + * Adam + * NAITOH Jun + * Hiroya Fujinami + * mprogrammer + +## 3.2.8 - 2024-05-16 {#version-3-2-8} + +### Fixes + + * Suppressed a warning + +## 3.2.7 - 2024-05-16 {#version-3-2-7} + +### Improvements + + * Improve parse performance by using `StringScanner`. + + * GH-106 + * GH-107 + * GH-108 + * GH-109 + * GH-112 + * GH-113 + * GH-114 + * GH-115 + * GH-116 + * GH-117 + * GH-118 + * GH-119 + * GH-121 + + * Patch by NAITOH Jun. + + * Improved parse performance when an attribute has many `<`s. + + * GH-126 + +### Fixes + + * XPath: Fixed a bug of `normalize_space(array)`. + + * GH-110 + * GH-111 + + * Patch by flatisland. + + * XPath: Fixed a bug that wrong position is used with nested path. + + * GH-110 + * GH-122 + + * Reported by jcavalieri. + * Patch by NAITOH Jun. + + * Fixed a bug that an exception message can't be generated for + invalid encoding XML. + + * GH-29 + * GH-123 + + * Reported by DuKewu. + * Patch by NAITOH Jun. + +### Thanks + + * NAITOH Jun + * flatisland + * jcavalieri + * DuKewu + +## 3.2.6 - 2023-07-27 {#version-3-2-6} + +### Improvements + + * Required Ruby 2.5 or later explicitly. + [GH-69][gh-69] + [Patch by Ivo Anjo] + + * Added documentation for maintenance cycle. + [GH-71][gh-71] + [Patch by Ivo Anjo] + + * Added tutorial. + [GH-77][gh-77] + [GH-78][gh-78] + [Patch by Burdette Lamar] + + * Improved performance and memory usage. + [GH-94][gh-94] + [Patch by fatkodima] + + * `REXML::Parsers::XPathParser#abbreviate`: Added support for + function arguments. + [GH-95][gh-95] + [Reported by pulver] + + * `REXML::Parsers::XPathParser#abbreviate`: Added support for string + literal that contains double-quote. + [GH-96][gh-96] + [Patch by pulver] + + * `REXML::Parsers::XPathParser#abbreviate`: Added missing `/` to + `:descendant_or_self/:self/:parent`. + [GH-97][gh-97] + [Reported by pulver] + + * `REXML::Parsers::XPathParser#abbreviate`: Added support for more patterns. + [GH-97][gh-97] + [Reported by pulver] + +### Fixes + + * Fixed a typo in NEWS. + [GH-72][gh-72] + [Patch by Spencer Goodman] + + * Fixed a typo in NEWS. + [GH-75][gh-75] + [Patch by Andrew Bromwich] + + * Fixed documents. + [GH-87][gh-87] + [Patch by Alexander Ilyin] + + * Fixed a bug that `Attriute` convert `'` and `'` even when + `attribute_quote: :quote` is used. + [GH-92][gh-92] + [Reported by Edouard Brière] + + * Fixed links in tutorial. + [GH-99][gh-99] + [Patch by gemmaro] + + +### Thanks + + * Ivo Anjo + + * Spencer Goodman + + * Andrew Bromwich + + * Burdette Lamar + + * Alexander Ilyin + + * Edouard Brière + + * fatkodima + + * pulver + + * gemmaro + +[gh-69]:https://github.com/ruby/rexml/issues/69 +[gh-71]:https://github.com/ruby/rexml/issues/71 +[gh-72]:https://github.com/ruby/rexml/issues/72 +[gh-75]:https://github.com/ruby/rexml/issues/75 +[gh-77]:https://github.com/ruby/rexml/issues/77 +[gh-87]:https://github.com/ruby/rexml/issues/87 +[gh-92]:https://github.com/ruby/rexml/issues/92 +[gh-94]:https://github.com/ruby/rexml/issues/94 +[gh-95]:https://github.com/ruby/rexml/issues/95 +[gh-96]:https://github.com/ruby/rexml/issues/96 +[gh-97]:https://github.com/ruby/rexml/issues/97 +[gh-98]:https://github.com/ruby/rexml/issues/98 +[gh-99]:https://github.com/ruby/rexml/issues/99 + ## 3.2.5 - 2021-04-05 {#version-3-2-5} ### Improvements * Add more validations to XPath parser. - * `require "rexml/docuemnt"` by default. + * `require "rexml/document"` by default. [GitHub#36][Patch by Koichi ITO] - * Don't add `#dcloe` method to core classes globally. + * Don't add `#dclone` method to core classes globally. [GitHub#37][Patch by Akira Matsuda] * Add more documentations. diff --git a/README.md b/README.md index 27da0e49..e8ab5082 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ REXML supports both tree and stream document parsing. Stream parsing is faster ( ## API -See the {API documentation}[https://ruby.github.io/rexml/] +See the [API documentation](https://ruby.github.io/rexml/). ## Usage @@ -33,6 +33,15 @@ doc = Document.new string So parsing a string is just as easy as parsing a file. +## Support + +REXML support follows the same maintenance cycle as Ruby releases, as shown on . + +If you are running on an end-of-life Ruby, do not expect modern REXML releases to be compatible with it; in fact, it's recommended that you DO NOT use this gem, and instead use the REXML version that came bundled with your end-of-life Ruby version. + +The `required_ruby_version` on the gemspec is kept updated on a [best-effort basis](https://github.com/ruby/rexml/pull/70) by the community. +Up to version 3.2.5, this information was not set. That version [is known broken with at least Ruby < 2.3](https://github.com/ruby/rexml/issues/69). + ## Development After checking out the repo, run `rake test` to run the tests. diff --git a/Rakefile b/Rakefile index 7143e754..76a56296 100644 --- a/Rakefile +++ b/Rakefile @@ -28,3 +28,42 @@ RDoc::Task.new do |rdoc| end load "#{__dir__}/tasks/tocs.rake" + +benchmark_tasks = [] +namespace :benchmark do + Dir.glob("benchmark/*.yaml").sort.each do |yaml| + name = File.basename(yaml, ".*") + env = { + "RUBYLIB" => nil, + "BUNDLER_ORIG_RUBYLIB" => nil, + } + command_line = [ + RbConfig.ruby, "-v", "-S", "benchmark-driver", File.expand_path(yaml), + ] + + desc "Run #{name} benchmark" + task name do + puts("```") + sh(env, *command_line) + puts("```") + end + benchmark_tasks << "benchmark:#{name}" + + case name + when /\Aparse/ + namespace name do + desc "Run #{name} benchmark: small" + task :small do + puts("```") + sh(env.merge("N_ELEMENTS" => "500", "N_ATTRIBUTES" => "1"), + *command_line) + puts("```") + end + benchmark_tasks << "benchmark:#{name}:small" + end + end + end +end + +desc "Run all benchmarks" +task :benchmark => benchmark_tasks diff --git a/benchmark/attribute.yaml b/benchmark/attribute.yaml new file mode 100644 index 00000000..5dd7fded --- /dev/null +++ b/benchmark/attribute.yaml @@ -0,0 +1,38 @@ +loop_count: 1000 +contexts: + - gems: + rexml: 3.2.6 + require: false + prelude: require 'rexml' + - name: master + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + - name: 3.2.6(YJIT) + gems: + rexml: 3.2.6 + require: false + prelude: | + require 'rexml' + RubyVM::YJIT.enable + - name: master(YJIT) + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + RubyVM::YJIT.enable + +prelude: | + require 'rexml/document' + + xml_source = "" + 100.times do + xml_source = "#{xml_source}" + end + xml_source = "#{xml_source}" + + document = REXML::Document.new(xml_source) + deepest_node = document.elements["//deepest"] + +benchmark: + with_ns: deepest_node.attribute("with_ns", "xyz") + without_ns: deepest_node.attribute("without_ns") diff --git a/benchmark/each_recursive.yaml b/benchmark/each_recursive.yaml new file mode 100644 index 00000000..c745f8ce --- /dev/null +++ b/benchmark/each_recursive.yaml @@ -0,0 +1,40 @@ +loop_count: 100 +contexts: + - gems: + rexml: 3.2.6 + require: false + prelude: require 'rexml' + - name: master + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + - name: 3.2.6(YJIT) + gems: + rexml: 3.2.6 + require: false + prelude: | + require 'rexml' + RubyVM::YJIT.enable + - name: master(YJIT) + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + RubyVM::YJIT.enable + +prelude: | + require 'rexml/document' + + xml_source = +"" + 100.times do + x_node_source = "" + 100.times do + x_node_source = "#{x_node_source}" + end + xml_source << x_node_source + end + xml_source << "" + + document = REXML::Document.new(xml_source) + +benchmark: + each_recursive: document.each_recursive { |_| } diff --git a/benchmark/gt.yaml b/benchmark/gt.yaml new file mode 100644 index 00000000..3f6af739 --- /dev/null +++ b/benchmark/gt.yaml @@ -0,0 +1,34 @@ +loop_count: 10 +contexts: + - gems: + rexml: 3.2.6 + require: false + prelude: require "rexml" + - name: master + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require "rexml" + - name: 3.2.6(YJIT) + gems: + rexml: 3.2.6 + require: false + prelude: | + require "rexml" + RubyVM::YJIT.enable + - name: master(YJIT) + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require "rexml" + RubyVM::YJIT.enable + +prelude: | + require "rexml/document" + + n = 10000 + gts = ">" * n + in_attribute = "" + in_text = "#{gts}" + +benchmark: + "attribute": REXML::Document.new(in_attribute) + "text": REXML::Document.new(in_text) diff --git a/benchmark/parse.yaml b/benchmark/parse.yaml new file mode 100644 index 00000000..f2c7d336 --- /dev/null +++ b/benchmark/parse.yaml @@ -0,0 +1,57 @@ +loop_count: 100 +contexts: + - gems: + rexml: 3.2.6 + require: false + prelude: require 'rexml' + - name: master + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + - name: 3.2.6(YJIT) + gems: + rexml: 3.2.6 + require: false + prelude: | + require 'rexml' + RubyVM::YJIT.enable + - name: master(YJIT) + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + RubyVM::YJIT.enable + +prelude: | + require 'rexml/document' + require 'rexml/parsers/sax2parser' + require 'rexml/parsers/pullparser' + require 'rexml/parsers/streamparser' + require 'rexml/streamlistener' + + n_elements = Integer(ENV.fetch("N_ELEMENTS", "5000"), 10) + n_attributes = Integer(ENV.fetch("N_ATTRIBUTES", "2"), 10) + + def build_xml(n_elements, n_attributes) + xml = '' + n_elements.times do |i| + xml << '' + end + xml << '' + end + xml = build_xml(n_elements, n_attributes) + + class Listener + include REXML::StreamListener + end + +benchmark: + 'dom' : REXML::Document.new(xml) + 'sax' : REXML::Parsers::SAX2Parser.new(xml).parse + 'pull' : | + parser = REXML::Parsers::PullParser.new(xml) + while parser.has_next? + parser.pull + end + 'stream' : REXML::Parsers::StreamParser.new(xml, Listener.new).parse diff --git a/doc/rexml/tasks/rdoc/element.rdoc b/doc/rexml/tasks/rdoc/element.rdoc index f229275f..4b3609b0 100644 --- a/doc/rexml/tasks/rdoc/element.rdoc +++ b/doc/rexml/tasks/rdoc/element.rdoc @@ -369,7 +369,7 @@ to retrieve the first text node in a specified element: Use method {Element#has_text?}[../../../../REXML/Element.html#method-i-has_text-3F] -to determine whethe the element has text: +to determine whether the element has text: e = REXML::Element.new('foo') e.has_text? # => false @@ -486,7 +486,7 @@ to remove a specific namespace from the element: Use method {Element#namespace}[../../../../REXML/Element.html#method-i-namespace] -to retrieve a speficic namespace URI for the element: +to retrieve a specific namespace URI for the element: xml_string = <<-EOT diff --git a/doc/rexml/tutorial.rdoc b/doc/rexml/tutorial.rdoc new file mode 100644 index 00000000..c85a70d0 --- /dev/null +++ b/doc/rexml/tutorial.rdoc @@ -0,0 +1,1358 @@ += \REXML Tutorial + +== Why \REXML? + +- Ruby's \REXML library is part of the Ruby distribution, + so using it requires no gem installations. +- \REXML is fully maintained. +- \REXML is mature, having been in use for long years. + +== To Include, or Not to Include? + +REXML is a module. +To use it, you must require it: + + require 'rexml' # => true + +If you do not also include it, you must fully qualify references to REXML: + + REXML::Document # => REXML::Document + +If you also include the module, you may optionally omit REXML::: + + include REXML + Document # => REXML::Document + REXML::Document # => REXML::Document + +== Preliminaries + +All examples here assume that the following code has been executed: + + require 'rexml' + include REXML + +The source XML for many examples here is from file +{books.xml}[https://www.w3schools.com/xml/books.xml] at w3schools.com. +You may find it convenient to open that page in a new tab +(Ctrl-click in some browsers). + +Note that your browser may display the XML with modified whitespace +and without the XML declaration, which in this case is: + + + +For convenience, we capture the XML into a string variable: + + require 'open-uri' + source_string = URI.open('https://www.w3schools.com/xml/books.xml').read + +And into a file: + + File.write('source_file.xml', source_string) + +Throughout these examples, variable +doc+ will hold only the document +derived from these sources: + + doc = Document.new(source_string) + +== Parsing \XML \Source + +=== Parsing a Document + +Use method REXML::Document::new to parse XML source. + +The source may be a string: + + doc = Document.new(source_string) + +Or an \IO stream: + + doc = File.open('source_file.xml', 'r') do |io| + Document.new(io) + end + +Method URI.open returns a StringIO object, +so the source can be from a web page: + + require 'open-uri' + io = URI.open("https://www.w3schools.com/xml/books.xml") + io.class # => StringIO + doc = Document.new(io) + +For any of these sources, the returned object is an REXML::Document: + + doc # => ... + doc.class # => REXML::Document + +Note: 'UNDEFINED' is the "name" displayed for a document, +even though doc.name returns an empty string "". + +A parsed document may produce \REXML objects of many classes, +but the two that are likely to be of greatest interest are +REXML::Document and REXML::Element. +These two classes are covered in great detail in this tutorial. + +=== Context (Parsing Options) + +The context for parsing a document is a hash that influences +the way the XML is read and stored. + +The context entries are: + +- +:respect_whitespace+: controls treatment of whitespace. +- +:compress_whitespace+: determines whether whitespace is compressed. +- +:ignore_whitespace_nodes+: determines whether whitespace-only nodes are to be ignored. +- +:raw+: controls treatment of special characters and entities. + +See {Element Context}[../context_rdoc.html]. + +== Exploring the Document + +An REXML::Document object represents an XML document. + +The object inherits from its ancestor classes: + +- REXML::Child (includes module REXML::Node) + - REXML::Parent (includes module {Enumerable}[rdoc-ref:Enumerable]). + - REXML::Element (includes module REXML::Namespace). + - REXML::Document + +This section covers only those properties and methods that are unique to a document +(that is, not inherited or included). + +=== Document Properties + +A document has several properties (other than its children); + +- Document type. +- Node type. +- Name. +- Document. +- XPath + +[Document Type] + + A document may have a document type: + + my_xml = '' + my_doc = Document.new(my_xml) + doc_type = my_doc.doctype + doc_type.class # => REXML::DocType + doc_type.to_s # => "" + +[Node Type] + + A document also has a node type (always +:document+): + + doc.node_type # => :document + +[Name] + + A document has a name (always an empty string): + + doc.name # => "" + +[Document] + + \Method REXML::Document#document returns +self+: + + doc.document == doc # => true + + An object of a different class (\REXML::Element or \REXML::Child) + may have a document, which is the document to which the object belongs; + if so, that document will be an \REXML::Document object. + + doc.root.document.class # => REXML::Document + +[XPath] + + \method REXML::Element#xpath returns the string xpath to the element, + relative to its most distant ancestor: + + doc.root.class # => REXML::Element + doc.root.xpath # => "/bookstore" + doc.root.texts.first # => "\n\n" + doc.root.texts.first.xpath # => "/bookstore/text()" + + If there is no ancestor, returns the expanded name of the element: + + Element.new('foo').xpath # => "foo" + +=== Document Children + +A document may have children of these types: + +- XML declaration. +- Root element. +- Text. +- Processing instructions. +- Comments. +- CDATA. + +[XML Declaration] + + A document may an XML declaration, which is stored as an REXML::XMLDecl object: + + doc.xml_decl # => + doc.xml_decl.class # => REXML::XMLDecl + + Document.new('').xml_decl # => + + my_xml = '"' + my_doc = Document.new(my_xml) + xml_decl = my_doc.xml_decl + xml_decl.to_s # => "" + + The version, encoding, and stand-alone values may be retrieved separately: + + my_doc.version # => "1.0" + my_doc.encoding # => "UTF-8" + my_doc.stand_alone? # => "yes" + +[Root Element] + + A document may have a single element child, called the _root_ _element_, + which is stored as an REXML::Element object; + it may be retrieved with method +root+: + + doc.root # => ... + doc.root.class # => REXML::Element + + Document.new('').root # => nil + +[Text] + + A document may have text passages, each of which is stored + as an REXML::Text object: + + doc.texts.each {|t| p [t.class, t] } + + Output: + + [REXML::Text, "\n"] + +[Processing Instructions] + + A document may have processing instructions, which are stored + as REXML::Instruction objects: + + + + Output: + + [REXML::Instruction, ] + [REXML::Instruction, ] + +[Comments] + + A document may have comments, which are stored + as REXML::Comment objects: + + my_xml = <<-EOT + + + EOT + my_doc = Document.new(my_xml) + my_doc.comments.each {|c| p [c.class, c] } + + Output: + + [REXML::Comment, # ... , @string="foo">] + [REXML::Comment, # ... , @string="bar">] + +[CDATA] + + A document may have CDATA entries, which are stored + as REXML::CData objects: + + my_xml = <<-EOT + + + EOT + my_doc = Document.new(my_xml) + my_doc.cdatas.each {|cd| p [cd.class, cd] } + + Output: + + [REXML::CData, "foo"] + [REXML::CData, "bar"] + +The payload of a document is a tree of nodes, descending from the root element: + + doc.root.children.each do |child| + p [child, child.class] + end + +Output: + + [REXML::Text, "\n\n"] + [REXML::Element, ... ] + [REXML::Text, "\n\n"] + [REXML::Element, ... ] + [REXML::Text, "\n\n"] + [REXML::Element, ... ] + [REXML::Text, "\n\n"] + [REXML::Element, ... ] + [REXML::Text, "\n\n"] + +== Exploring an Element + +An REXML::Element object represents an XML element. + +The object inherits from its ancestor classes: + +- REXML::Child (includes module REXML::Node) + - REXML::Parent (includes module {Enumerable}[rdoc-ref:Enumerable]). + - REXML::Element (includes module REXML::Namespace). + +This section covers methods: + +- Defined in REXML::Element itself. +- Inherited from REXML::Parent and REXML::Child. +- Included from REXML::Node. + +=== Inside the Element + +[Brief String Representation] + + Use method REXML::Element#inspect to retrieve a brief string representation. + + doc.root.inspect # => " ... " + + The ellipsis (...) indicates that the element has children. + When there are no children, the ellipsis is omitted: + + Element.new('foo').inspect # => "" + + If the element has attributes, those are also included: + + doc.root.elements.first.inspect # => " ... " + +[Extended String Representation] + + Use inherited method REXML::Child.bytes to retrieve an extended + string representation. + + doc.root.bytes # => "\n\n\n Everyday Italian\n Giada De Laurentiis\n 2005\n 30.00\n\n\n\n Harry Potter\n J K. Rowling\n 2005\n 29.99\n\n\n\n XQuery Kick Start\n James McGovern\n Per Bothner\n Kurt Cagle\n James Linn\n Vaidyanathan Nagarajan\n 2003\n 49.99\n\n\n\n Learning XML\n Erik T. Ray\n 2003\n 39.95\n\n\n" + +[Node Type] + + Use method REXML::Element#node_type to retrieve the node type (always +:element+): + + doc.root.node_type # => :element + +[Raw Mode] + + Use method REXML::Element#raw to retrieve whether (+true+ or +nil+) + raw mode is set. + + doc.root.raw # => nil + +[Context] + + Use method REXML::Element#context to retrieve the context hash + (see {Element Context}[../context_rdoc.html]): + + doc.root.context # => {} + +=== Relationships + +An element may have: + +- Ancestors. +- Siblings. +- Children. + +==== Ancestors + +[Containing Document] + + Use method REXML::Element#document to retrieve the containing document, if any: + + ele = doc.root.elements.first # => ... + ele.document # => ... + ele = Element.new('foo') # => + ele.document # => nil + +[Root Element] + + Use method REXML::Element#root to retrieve the root element: + + ele = doc.root.elements.first # => ... + ele.root # => ... + ele = Element.new('foo') # => + ele.root # => + +[Root Node] + + Use method REXML::Element#root_node to retrieve the most distant ancestor, + which is the containing document, if any, otherwise the root element: + + ele = doc.root.elements.first # => ... + ele.root_node # => ... + ele = Element.new('foo') # => + ele.root_node # => + +[Parent] + + Use inherited method REXML::Child#parent to retrieve the parent + + ele = doc.root # => ... + ele.parent # => ... + ele = doc.root.elements.first # => ... + ele.parent # => ... + + Use included method REXML::Node#index_in_parent to retrieve the index + of the element among all of its parents children (not just the element children). + Note that while the index for doc.root.elements[n] is 1-based, + the returned index is 0-based. + + doc.root.children # => + # ["\n\n", + # ... , + # "\n\n", + # ... , + # "\n\n", + # ... , + # "\n\n", + # ... , + # "\n\n"] + ele = doc.root.elements[1] # => ... + ele.index_in_parent # => 2 + ele = doc.root.elements[2] # => ... + ele.index_in_parent# => 4 + +==== Siblings + +[Next Element] + + Use method REXML::Element#next_element to retrieve the first following + sibling that is itself an element (+nil+ if there is none): + + ele = doc.root.elements[1] + while ele do + p [ele.class, ele] + ele = ele.next_element + end + p ele + + Output: + + [REXML::Element, ... ] + [REXML::Element, ... ] + [REXML::Element, ... ] + [REXML::Element, ... ] + +[Previous Element] + + Use method REXML::Element#previous_element to retrieve the first preceding + sibling that is itself an element (+nil+ if there is none): + + ele = doc.root.elements[4] + while ele do + p [ele.class, ele] + ele = ele.previous_element + end + p ele + + Output: + + [REXML::Element, ... ] + [REXML::Element, ... ] + [REXML::Element, ... ] + [REXML::Element, ... ] + +[Next Node] + + Use included method REXML::Node.next_sibling_node + (or its alias next_sibling) to retrieve the first following node + regardless of its class: + + node = doc.root.children[0] + while node do + p [node.class, node] + node = node.next_sibling + end + p node + + Output: + + [REXML::Text, "\n\n"] + [REXML::Element, ... ] + [REXML::Text, "\n\n"] + [REXML::Element, ... ] + [REXML::Text, "\n\n"] + [REXML::Element, ... ] + [REXML::Text, "\n\n"] + [REXML::Element, ... ] + [REXML::Text, "\n\n"] + +[Previous Node] + + Use included method REXML::Node.previous_sibling_node + (or its alias previous_sibling) to retrieve the first preceding node + regardless of its class: + + node = doc.root.children[-1] + while node do + p [node.class, node] + node = node.previous_sibling + end + p node + + Output: + + [REXML::Text, "\n\n"] + [REXML::Element, ... ] + [REXML::Text, "\n\n"] + [REXML::Element, ... ] + [REXML::Text, "\n\n"] + [REXML::Element, ... ] + [REXML::Text, "\n\n"] + [REXML::Element, ... ] + [REXML::Text, "\n\n"] + +==== Children + +[Child Count] + + Use inherited method REXML::Parent.size to retrieve the count + of nodes (of all types) in the element: + + doc.root.size # => 9 + +[Child Nodes] + + Use inherited method REXML::Parent.children to retrieve an array + of the child nodes (of all types): + + doc.root.children # => + # ["\n\n", + # ... , + # "\n\n", + # ... , + # "\n\n", + # ... , + # "\n\n", + # ... , + # "\n\n"] + +[Child at Index] + + Use method REXML::Element#[] to retrieve the child at a given numerical index, + or +nil+ if there is no such child: + + doc.root[0] # => "\n\n" + doc.root[1] # => ... + doc.root[7] # => ... + doc.root[8] # => "\n\n" + + doc.root[-1] # => "\n\n" + doc.root[-2] # => ... + + doc.root[50] # => nil + +[Index of Child] + + Use method REXML::Parent#index to retrieve the zero-based child index + of the given object, or #size - 1 if there is no such child: + + ele = doc.root # => ... + ele.index(ele[0]) # => 0 + ele.index(ele[1]) # => 1 + ele.index(ele[7]) # => 7 + ele.index(ele[8]) # => 8 + + ele.index(ele[-1]) # => 8 + ele.index(ele[-2]) # => 7 + + ele.index(ele[50]) # => 8 + +[Element Children] + + Use method REXML::Element#has_elements? to retrieve whether the element + has element children: + + doc.root.has_elements? # => true + REXML::Element.new('foo').has_elements? # => false + + Use method REXML::Element#elements to retrieve the REXML::Elements object + containing the element children: + + eles = doc.root.elements + eles # => # ... > + eles.size # => 4 + eles.each {|e| p [e.class], e } + + Output: + + [ ... , + ... , + ... , + ... + ] + +Note that while in this example, all the element children of the root element are +elements of the same name, 'book', that is not true of all documents; +a root element (or any other element) may have any mixture of child elements. + +[CDATA Children] + + Use method REXML::Element#cdatas to retrieve a frozen array of CDATA children: + + my_xml = <<-EOT + + + + + EOT + my_doc = REXML::Document.new(my_xml) + cdatas my_doc.root.cdatas + cdatas.frozen? # => true + cdatas.map {|cd| cd.class } # => [REXML::CData, REXML::CData] + +[Comment Children] + + Use method REXML::Element#comments to retrieve a frozen array of comment children: + + my_xml = <<-EOT + + + + + EOT + my_doc = REXML::Document.new(my_xml) + comments = my_doc.root.comments + comments.frozen? # => true + comments.map {|c| c.class } # => [REXML::Comment, REXML::Comment] + comments.map {|c| c.to_s } # => ["foo", "bar"] + +[Processing Instruction Children] + + Use method REXML::Element#instructions to retrieve a frozen array + of processing instruction children: + + my_xml = <<-EOT + + + + + EOT + my_doc = REXML::Document.new(my_xml) + instrs = my_doc.root.instructions + instrs.frozen? # => true + instrs.map {|i| i.class } # => [REXML::Instruction, REXML::Instruction] + instrs.map {|i| i.to_s } # => ["", ""] + +[Text Children] + + Use method REXML::Element#has_text? to retrieve whether the element + has text children: + + doc.root.has_text? # => true + REXML::Element.new('foo').has_text? # => false + + Use method REXML::Element#texts to retrieve a frozen array of text children: + + my_xml = 'textmore' + my_doc = REXML::Document.new(my_xml) + texts = my_doc.root.texts + texts.frozen? # => true + texts.map {|t| t.class } # => [REXML::Text, REXML::Text] + texts.map {|t| t.to_s } # => ["text", "more"] + +[Parenthood] + + Use inherited method REXML::Parent.parent? to retrieve whether the element is a parent; + always returns +true+; only REXML::Child#parent returns +false+. + + doc.root.parent? # => true + +=== Element Attributes + +Use method REXML::Element#has_attributes? to return whether the element +has attributes: + + ele = doc.root # => ... + ele.has_attributes? # => false + ele = ele.elements.first # => ... + ele.has_attributes? # => true + +Use method REXML::Element#attributes to return the hash +containing the attributes for the element. +Each hash key is a string attribute name; +each hash value is an REXML::Attribute object. + + ele = doc.root # => ... + attrs = ele.attributes # => {} + + ele = ele.elements.first # => ... + attrs = ele.attributes # => {"category"=>category='cooking'} + attrs.size # => 1 + attr_name = attrs.keys.first # => "category" + attr_name.class # => String + attr_value = attrs.values.first # => category='cooking' + attr_value.class # => REXML::Attribute + +Use method REXML::Element#[] to retrieve the string value for a given attribute, +which may be given as either a string or a symbol: + + ele = doc.root.elements.first # => ... + attr_value = ele['category'] # => "cooking" + attr_value.class # => String + ele['nosuch'] # => nil + +Use method REXML::Element#attribute to retrieve the value of a named attribute: + + my_xml = "" + my_doc = REXML::Document.new(my_xml) + my_doc.root.attribute("x") # => x='x' + my_doc.root.attribute("x", "a") # => a:x='a:x' + +== Whitespace + +Use method REXML::Element#ignore_whitespace_nodes to determine whether +whitespace nodes were ignored when the XML was parsed; +returns +true+ if so, +nil+ otherwise. + +Use method REXML::Element#whitespace to determine whether whitespace +is respected for the element; returns +true+ if so, +false+ otherwise. + +== Namespaces + +Use method REXML::Element#namespace to retrieve the string namespace URI +for the element, which may derive from one of its ancestors: + + xml_string = <<-EOT + + + + + + + EOT + d = Document.new(xml_string) + b = d.elements['//b'] + b.namespace # => "1" + b.namespace('y') # => "2" + b.namespace('nosuch') # => nil + +Use method REXML::Element#namespaces to retrieve a hash of all defined namespaces +in the element and its ancestors: + + xml_string = <<-EOT + + + + + + + EOT + d = Document.new(xml_string) + d.elements['//a'].namespaces # => {"x"=>"1", "y"=>"2"} + d.elements['//b'].namespaces # => {"x"=>"1", "y"=>"2"} + d.elements['//c'].namespaces # => {"x"=>"1", "y"=>"2", "z"=>"3"} + +Use method REXML::Element#prefixes to retrieve an array of the string prefixes (names) +of all defined namespaces in the element and its ancestors: + + xml_string = <<-EOT + + + + + + + EOT + d = Document.new(xml_string, {compress_whitespace: :all}) + d.elements['//a'].prefixes # => ["x", "y"] + d.elements['//b'].prefixes # => ["x", "y"] + d.elements['//c'].prefixes # => ["x", "y", "z"] + +== Traversing + +You can use certain methods to traverse children of the element. +Each child that meets given criteria is yielded to the given block. + +[Traverse All Children] + + Use inherited method REXML::Parent#each (or its alias #each_child) to traverse + all children of the element: + + doc.root.each {|child| p [child.class, child] } + + Output: + + [REXML::Text, "\n\n"] + [REXML::Element, ... ] + [REXML::Text, "\n\n"] + [REXML::Element, ... ] + [REXML::Text, "\n\n"] + [REXML::Element, ... ] + [REXML::Text, "\n\n"] + [REXML::Element, ... ] + [REXML::Text, "\n\n"] + +[Traverse Element Children] + + Use method REXML::Element#each_element to traverse only the element children + of the element: + + doc.root.each_element {|e| p [e.class, e] } + + Output: + + [REXML::Element, ... ] + [REXML::Element, ... ] + [REXML::Element, ... ] + [REXML::Element, ... ] + +[Traverse Element Children with Attribute] + + Use method REXML::Element#each_element_with_attribute with the single argument + +attr_name+ to traverse each element child that has the given attribute: + + my_doc = Document.new '' + my_doc.root.each_element_with_attribute('id') {|e| p [e.class, e] } + + Output: + + [REXML::Element, ] + [REXML::Element, ] + [REXML::Element, ] + + Use the same method with a second argument +value+ to traverse + each element child element that has the given attribute and value: + + my_doc.root.each_element_with_attribute('id', '1') {|e| p [e.class, e] } + + Output: + + [REXML::Element, ] + [REXML::Element, ] + + Use the same method with a third argument +max+ to traverse + no more than the given number of element children: + + my_doc.root.each_element_with_attribute('id', '1', 1) {|e| p [e.class, e] } + + Output: + + [REXML::Element, ] + + Use the same method with a fourth argument +xpath+ to traverse + only those element children that match the given xpath: + + my_doc.root.each_element_with_attribute('id', '1', 2, '//d') {|e| p [e.class, e] } + + Output: + + [REXML::Element, ] + +[Traverse Element Children with Text] + + Use method REXML::Element#each_element_with_text with no arguments + to traverse those element children that have text: + + my_doc = Document.new 'bbd' + my_doc.root.each_element_with_text {|e| p [e.class, e] } + + Output: + + [REXML::Element, ... ] + [REXML::Element, ... ] + [REXML::Element, ... ] + + Use the same method with the single argument +text+ to traverse + those element children that have exactly that text: + + my_doc.root.each_element_with_text('b') {|e| p [e.class, e] } + + Output: + + [REXML::Element, ... ] + [REXML::Element, ... ] + + Use the same method with additional second argument +max+ to traverse + no more than the given number of element children: + + my_doc.root.each_element_with_text('b', 1) {|e| p [e.class, e] } + + Output: + + [REXML::Element, ... ] + + Use the same method with additional third argument +xpath+ to traverse + only those element children that also match the given xpath: + + my_doc.root.each_element_with_text('b', 2, '//c') {|e| p [e.class, e] } + + Output: + + [REXML::Element, ... ] + +[Traverse Element Children's Indexes] + + Use inherited method REXML::Parent#each_index to traverse all children's indexes + (not just those of element children): + + doc.root.each_index {|i| print i } + + Output: + + 012345678 + +[Traverse Children Recursively] + + Use included method REXML::Node#each_recursive to traverse all children recursively: + + doc.root.each_recursive {|child| p [child.class, child] } + + Output: + + [REXML::Element, ... ] + [REXML::Element, ... </>] + [REXML::Element, <author> ... </>] + [REXML::Element, <year> ... </>] + [REXML::Element, <price> ... </>] + [REXML::Element, <book category='children'> ... </>] + [REXML::Element, <title lang='en'> ... </>] + [REXML::Element, <author> ... </>] + [REXML::Element, <year> ... </>] + [REXML::Element, <price> ... </>] + [REXML::Element, <book category='web'> ... </>] + [REXML::Element, <title lang='en'> ... </>] + [REXML::Element, <author> ... </>] + [REXML::Element, <author> ... </>] + [REXML::Element, <author> ... </>] + [REXML::Element, <author> ... </>] + [REXML::Element, <author> ... </>] + [REXML::Element, <year> ... </>] + [REXML::Element, <price> ... </>] + [REXML::Element, <book category='web' cover='paperback'> ... </>] + [REXML::Element, <title lang='en'> ... </>] + [REXML::Element, <author> ... </>] + [REXML::Element, <year> ... </>] + [REXML::Element, <price> ... </>] + +== Searching + +You can use certain methods to search among the descendants of an element. + +Use method REXML::Element#get_elements to retrieve all element children of the element +that match the given +xpath+: + + xml_string = <<-EOT + <root> + <a level='1'> + <a level='2'/> + </a> + </root> + EOT + d = Document.new(xml_string) + d.root.get_elements('//a') # => [<a level='1'> ... </>, <a level='2'/>] + +Use method REXML::Element#get_text with no argument to retrieve the first text node +in the first child: + + my_doc = Document.new "<p>some text <b>this is bold!</b> more text</p>" + text_node = my_doc.root.get_text + text_node.class # => REXML::Text + text_node.to_s # => "some text " + +Use the same method with argument +xpath+ to retrieve the first text node +in the first child that matches the xpath: + + my_doc.root.get_text(1) # => "this is bold!" + +Use method REXML::Element#text with no argument to retrieve the text +from the first text node in the first child: + + my_doc = Document.new "<p>some text <b>this is bold!</b> more text</p>" + text_node = my_doc.root.text + text_node.class # => String + text_node # => "some text " + +Use the same method with argument +xpath+ to retrieve the text from the first text node +in the first child that matches the xpath: + + my_doc.root.text(1) # => "this is bold!" + +Use included method REXML::Node#find_first_recursive +to retrieve the first descendant element +for which the given block returns a truthy value, or +nil+ if none: + + doc.root.find_first_recursive do |ele| + ele.name == 'price' + end # => <price> ... </> + doc.root.find_first_recursive do |ele| + ele.name == 'nosuch' + end # => nil + +== Editing + +=== Editing a Document + +[Creating a Document] + + Create a new document with method REXML::Document::new: + + doc = Document.new(source_string) + empty_doc = REXML::Document.new + +[Adding to the Document] + + Add an XML declaration with method REXML::Document#add + and an argument of type REXML::XMLDecl: + + my_doc = Document.new + my_doc.xml_decl.to_s # => "" + my_doc.add(XMLDecl.new('2.0')) + my_doc.xml_decl.to_s # => "<?xml version='2.0'?>" + + Add a document type with method REXML::Document#add + and an argument of type REXML::DocType: + + my_doc = Document.new + my_doc.doctype.to_s # => "" + my_doc.add(DocType.new('foo')) + my_doc.doctype.to_s # => "<!DOCTYPE foo>" + + Add a node of any other REXML type with method REXML::Document#add and an argument + that is not of type REXML::XMLDecl or REXML::DocType: + + my_doc = Document.new + my_doc.add(Element.new('foo')) + my_doc.to_s # => "<foo/>" + + Add an existing element as the root element with method REXML::Document#add_element: + + ele = Element.new('foo') + my_doc = Document.new + my_doc.add_element(ele) + my_doc.root # => <foo/> + + Create and add an element as the root element with method REXML::Document#add_element: + + my_doc = Document.new + my_doc.add_element('foo') + my_doc.root # => <foo/> + +=== Editing an Element + +==== Creating an Element + +Create a new element with method REXML::Element::new: + + ele = Element.new('foo') # => <foo/> + +==== Setting Element Properties + +Set the context for an element with method REXML::Element#context= +(see {Element Context}[../context_rdoc.html]): + + ele.context # => nil + ele.context = {ignore_whitespace_nodes: :all} + ele.context # => {:ignore_whitespace_nodes=>:all} + +Set the parent for an element with inherited method REXML::Child#parent= + + ele.parent # => nil + ele.parent = Element.new('bar') + ele.parent # => <bar/> + +Set the text for an element with method REXML::Element#text=: + + ele.text # => nil + ele.text = 'bar' + ele.text # => "bar" + +==== Adding to an Element + +Add a node as the last child with inherited method REXML::Parent#add (or its alias #push): + + ele = Element.new('foo') # => <foo/> + ele.push(Text.new('bar')) + ele.push(Element.new('baz')) + ele.children # => ["bar", <baz/>] + +Add a node as the first child with inherited method REXML::Parent#unshift: + + ele = Element.new('foo') # => <foo/> + ele.unshift(Element.new('bar')) + ele.unshift(Text.new('baz')) + ele.children # => ["bar", <baz/>] + +Add an element as the last child with method REXML::Element#add_element: + + ele = Element.new('foo') # => <foo/> + ele.add_element('bar') + ele.add_element(Element.new('baz')) + ele.children # => [<bar/>, <baz/>] + +Add a text node as the last child with method REXML::Element#add_text: + + ele = Element.new('foo') # => <foo/> + ele.add_text('bar') + ele.add_text(Text.new('baz')) + ele.children # => ["bar", "baz"] + +Insert a node before a given node with method REXML::Parent#insert_before: + + ele = Element.new('foo') # => <foo/> + ele.add_text('bar') + ele.add_text(Text.new('baz')) + ele.children # => ["bar", "baz"] + target = ele[1] # => "baz" + ele.insert_before(target, Text.new('bat')) + ele.children # => ["bar", "bat", "baz"] + +Insert a node after a given node with method REXML::Parent#insert_after: + + ele = Element.new('foo') # => <foo/> + ele.add_text('bar') + ele.add_text(Text.new('baz')) + ele.children # => ["bar", "baz"] + target = ele[0] # => "bar" + ele.insert_after(target, Text.new('bat')) + ele.children # => ["bar", "bat", "baz"] + +Add an attribute with method REXML::Element#add_attribute: + + ele = Element.new('foo') # => <foo/> + ele.add_attribute('bar', 'baz') + ele.add_attribute(Attribute.new('bat', 'bam')) + ele.attributes # => {"bar"=>bar='baz', "bat"=>bat='bam'} + +Add multiple attributes with method REXML::Element#add_attributes: + + ele = Element.new('foo') # => <foo/> + ele.add_attributes({'bar' => 'baz', 'bat' => 'bam'}) + ele.add_attributes([['ban', 'bap'], ['bah', 'bad']]) + ele.attributes # => {"bar"=>bar='baz', "bat"=>bat='bam', "ban"=>ban='bap', "bah"=>bah='bad'} + +Add a namespace with method REXML::Element#add_namespace: + + ele = Element.new('foo') # => <foo/> + ele.add_namespace('bar') + ele.add_namespace('baz', 'bat') + ele.namespaces # => {"xmlns"=>"bar", "baz"=>"bat"} + +==== Deleting from an Element + +Delete a specific child object with inherited method REXML::Parent#delete: + + ele = Element.new('foo') # => <foo/> + ele.add_element('bar') + ele.add_text('baz') + ele.children # => [<bar/>, "baz"] + target = ele[1] # => "baz" + ele.delete(target) # => "baz" + ele.children # => [<bar/>] + target = ele[0] # => <baz/> + ele.delete(target) # => <baz/> + ele.children # => [] + +Delete a child at a specific index with inherited method REXML::Parent#delete_at: + + ele = Element.new('foo') # => <foo/> + ele.add_element('bar') + ele.add_text('baz') + ele.children # => [<bar/>, "baz"] + ele.delete_at(1) + ele.children # => [<bar/>] + ele.delete_at(0) + ele.children # => [] + +Delete all children meeting a specified criterion with inherited method +REXML::Parent#delete_if: + + ele = Element.new('foo') # => <foo/> + ele.add_element('bar') + ele.add_text('baz') + ele.add_element('bat') + ele.add_text('bam') + ele.children # => [<bar/>, "baz", <bat/>, "bam"] + ele.delete_if {|child| child.instance_of?(Text) } + ele.children # => [<bar/>, <bat/>] + +Delete an element at a specific 1-based index with method REXML::Element#delete_element: + + ele = Element.new('foo') # => <foo/> + ele.add_element('bar') + ele.add_text('baz') + ele.add_element('bat') + ele.add_text('bam') + ele.children # => [<bar/>, "baz", <bat/>, "bam"] + ele.delete_element(2) # => <bat/> + ele.children # => [<bar/>, "baz", "bam"] + ele.delete_element(1) # => <bar/> + ele.children # => ["baz", "bam"] + +Delete a specific element with the same method: + + ele = Element.new('foo') # => <foo/> + ele.add_element('bar') + ele.add_text('baz') + ele.add_element('bat') + ele.add_text('bam') + ele.children # => [<bar/>, "baz", <bat/>, "bam"] + target = ele.elements[2] # => <bat/> + ele.delete_element(target) # => <bat/> + ele.children # => [<bar/>, "baz", "bam"] + +Delete an element matching an xpath using the same method: + + ele = Element.new('foo') # => <foo/> + ele.add_element('bar') + ele.add_text('baz') + ele.add_element('bat') + ele.add_text('bam') + ele.children # => [<bar/>, "baz", <bat/>, "bam"] + ele.delete_element('./bat') # => <bat/> + ele.children # => [<bar/>, "baz", "bam"] + ele.delete_element('./bar') # => <bar/> + ele.children # => ["baz", "bam"] + +Delete an attribute by name with method REXML::Element#delete_attribute: + + ele = Element.new('foo') # => <foo/> + ele.add_attributes({'bar' => 'baz', 'bam' => 'bat'}) + ele.attributes # => {"bar"=>bar='baz', "bam"=>bam='bat'} + ele.delete_attribute('bam') + ele.attributes # => {"bar"=>bar='baz'} + +Delete a namespace with method REXML::Element#delete_namespace: + + ele = Element.new('foo') # => <foo/> + ele.add_namespace('bar') + ele.add_namespace('baz', 'bat') + ele.namespaces # => {"xmlns"=>"bar", "baz"=>"bat"} + ele.delete_namespace('xmlns') + ele.namespaces # => {} # => {"baz"=>"bat"} + ele.delete_namespace('baz') + ele.namespaces # => {} # => {} + +Remove an element from its parent with inherited method REXML::Child#remove: + + ele = Element.new('foo') # => <foo/> + parent = Element.new('bar') # => <bar/> + parent.add_element(ele) # => <foo/> + parent.children.size # => 1 + ele.remove # => <foo/> + parent.children.size # => 0 + +==== Replacing Nodes + +Replace the node at a given 0-based index with inherited method REXML::Parent#[]=: + + ele = Element.new('foo') # => <foo/> + ele.add_element('bar') + ele.add_text('baz') + ele.add_element('bat') + ele.add_text('bam') + ele.children # => [<bar/>, "baz", <bat/>, "bam"] + ele[2] = Text.new('bad') # => "bad" + ele.children # => [<bar/>, "baz", "bad", "bam"] + +Replace a given node with another node with inherited method REXML::Parent#replace_child: + + ele = Element.new('foo') # => <foo/> + ele.add_element('bar') + ele.add_text('baz') + ele.add_element('bat') + ele.add_text('bam') + ele.children # => [<bar/>, "baz", <bat/>, "bam"] + target = ele[2] # => <bat/> + ele.replace_child(target, Text.new('bah')) + ele.children # => [<bar/>, "baz", "bah", "bam"] + +Replace +self+ with a given node with inherited method REXML::Child#replace_with: + + ele = Element.new('foo') # => <foo/> + ele.add_element('bar') + ele.add_text('baz') + ele.add_element('bat') + ele.add_text('bam') + ele.children # => [<bar/>, "baz", <bat/>, "bam"] + target = ele[2] # => <bat/> + target.replace_with(Text.new('bah')) + ele.children # => [<bar/>, "baz", "bah", "bam"] + +=== Cloning + +Create a shallow clone of an element with method REXML::Element#clone. +The clone contains the name and attributes, but not the parent or children: + + ele = Element.new('foo') + ele.add_attributes({'bar' => 0, 'baz' => 1}) + ele.clone # => <foo bar='0' baz='1'/> + +Create a shallow clone of a document with method REXML::Document#clone. +The XML declaration is copied; the document type and root element are not cloned: + + my_xml = '<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE foo><root/>' + my_doc = Document.new(my_xml) + clone_doc = my_doc.clone + + my_doc.xml_decl # => <?xml ... ?> + clone_doc.xml_decl # => <?xml ... ?> + + my_doc.doctype.to_s # => "<?xml version='1.0' encoding='UTF-8'?>" + clone_doc.doctype.to_s # => "" + + my_doc.root # => <root/> + clone_doc.root # => nil + +Create a deep clone of an element with inherited method REXML::Parent#deep_clone. +All nodes and attributes are copied: + + doc.to_s.size # => 825 + clone = doc.deep_clone + clone.to_s.size # => 825 + +== Writing the Document + +Write a document to an \IO stream (defaults to <tt>$stdout</tt>) +with method REXML::Document#write: + + doc.write + +Output: + + <?xml version='1.0' encoding='UTF-8'?> + <bookstore> + + <book category='cooking'> + <title lang='en'>Everyday Italian + Giada De Laurentiis + 2005 + 30.00 + + + + Harry Potter + J K. Rowling + 2005 + 29.99 + + + + XQuery Kick Start + James McGovern + Per Bothner + Kurt Cagle + James Linn + Vaidyanathan Nagarajan + 2003 + 49.99 + + + + Learning XML + Erik T. Ray + 2003 + 39.95 + + + diff --git a/lib/rexml/attribute.rb b/lib/rexml/attribute.rb index 8933a013..11893a95 100644 --- a/lib/rexml/attribute.rb +++ b/lib/rexml/attribute.rb @@ -1,4 +1,4 @@ -# frozen_string_literal: false +# frozen_string_literal: true require_relative "namespace" require_relative 'text' @@ -13,9 +13,6 @@ class Attribute # The element to which this attribute belongs attr_reader :element - # The normalized value of this attribute. That is, the attribute with - # entities intact. - attr_writer :normalized PATTERN = /\s*(#{NAME_STR})\s*=\s*(["'])(.*?)\2/um NEEDS_A_SECOND_CHECK = /(<|&((#{Entity::NAME});|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));)?)/um @@ -122,10 +119,13 @@ def hash # b = Attribute.new( "ns:x", "y" ) # b.to_string # -> "ns:x='y'" def to_string + value = to_s if @element and @element.context and @element.context[:attribute_quote] == :quote - %Q^#@expanded_name="#{to_s().gsub(/"/, '"')}"^ + value = value.gsub('"', '"') if value.include?('"') + %Q^#@expanded_name="#{value}"^ else - "#@expanded_name='#{to_s().gsub(/'/, ''')}'" + value = value.gsub("'", ''') if value.include?("'") + "#@expanded_name='#{value}'" end end @@ -141,7 +141,6 @@ def to_s return @normalized if @normalized @normalized = Text::normalize( @unnormalized, doctype ) - @unnormalized = nil @normalized end @@ -150,10 +149,16 @@ def to_s def value return @unnormalized if @unnormalized @unnormalized = Text::unnormalize( @normalized, doctype ) - @normalized = nil @unnormalized end + # The normalized value of this attribute. That is, the attribute with + # entities intact. + def normalized=(new_normalized) + @normalized = new_normalized + @unnormalized = nil + end + # Returns a copy of this attribute def clone Attribute.new self @@ -190,7 +195,7 @@ def node_type end def inspect - rv = "" + rv = +"" write( rv ) rv end diff --git a/lib/rexml/document.rb b/lib/rexml/document.rb index 2edeb987..b1caa020 100644 --- a/lib/rexml/document.rb +++ b/lib/rexml/document.rb @@ -69,7 +69,7 @@ class Document < Element # d.to_s # => "FooBar" # # When argument +document+ is given, it must be an existing - # document object, whose context and attributes (but not chidren) + # document object, whose context and attributes (but not children) # are cloned into the new document: # # d = REXML::Document.new(xml_string) diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb index 4c21dbd5..a5808d7c 100644 --- a/lib/rexml/element.rb +++ b/lib/rexml/element.rb @@ -7,14 +7,6 @@ require_relative "parseexception" module REXML - # An implementation note about namespaces: - # As we parse, when we find namespaces we put them in a hash and assign - # them a unique ID. We then convert the namespace prefix for the node - # to the unique ID. This makes namespace lookup much faster for the - # cost of extra memory use. We save the namespace prefix for the - # context node and convert it back when we write it. - @@namespaces = {} - # An \REXML::Element object represents an XML element. # # An element: @@ -989,7 +981,7 @@ def previous_element # :call-seq: # has_text? -> true or false # - # Returns +true if the element has one or more text noded, + # Returns +true+ if the element has one or more text noded, # +false+ otherwise: # # d = REXML::Document.new 'text' @@ -1006,7 +998,7 @@ def has_text? # text(xpath = nil) -> text_string or nil # # Returns the text string from the first text node child - # in a specified element, if it exists, # +nil+ otherwise. + # in a specified element, if it exists, +nil+ otherwise. # # With no argument, returns the text from the first text node in +self+: # @@ -1014,7 +1006,7 @@ def has_text? # d.root.text.class # => String # d.root.text # => "some text " # - # With argument +xpath+, returns text from the the first text node + # With argument +xpath+, returns text from the first text node # in the element that matches +xpath+: # # d.root.text(1) # => "this is bold!" @@ -1284,16 +1276,11 @@ def [](name_or_index) # document.root.attribute("x", "a") # => a:x='a:x' # def attribute( name, namespace=nil ) - prefix = nil - if namespaces.respond_to? :key - prefix = namespaces.key(namespace) if namespace - else - prefix = namespaces.index(namespace) if namespace - end + prefix = namespaces.key(namespace) if namespace prefix = nil if prefix == 'xmlns' ret_val = - attributes.get_attribute( "#{prefix ? prefix + ':' : ''}#{name}" ) + attributes.get_attribute( prefix ? "#{prefix}:#{name}" : name ) return ret_val unless ret_val.nil? return nil if prefix.nil? diff --git a/lib/rexml/entity.rb b/lib/rexml/entity.rb index 89a9e84c..573db691 100644 --- a/lib/rexml/entity.rb +++ b/lib/rexml/entity.rb @@ -132,24 +132,34 @@ def to_s # then: # doctype.entity('yada').value #-> "nanoo bar nanoo" def value - if @value - matches = @value.scan(PEREFERENCE_RE) - rv = @value.clone - if @parent - sum = 0 - matches.each do |entity_reference| - entity_value = @parent.entity( entity_reference[0] ) - if sum + entity_value.bytesize > Security.entity_expansion_text_limit - raise "entity expansion has grown too large" - else - sum += entity_value.bytesize - end - rv.gsub!( /%#{entity_reference.join};/um, entity_value ) + @resolved_value ||= resolve_value + end + + def parent=(other) + @resolved_value = nil + super + end + + private + def resolve_value + return nil if @value.nil? + return @value unless @value.match?(PEREFERENCE_RE) + + matches = @value.scan(PEREFERENCE_RE) + rv = @value.clone + if @parent + sum = 0 + matches.each do |entity_reference| + entity_value = @parent.entity( entity_reference[0] ) + if sum + entity_value.bytesize > Security.entity_expansion_text_limit + raise "entity expansion has grown too large" + else + sum += entity_value.bytesize end + rv.gsub!( /%#{entity_reference.join};/um, entity_value ) end - return rv end - nil + rv end end diff --git a/lib/rexml/formatters/pretty.rb b/lib/rexml/formatters/pretty.rb index 562ef946..a838d835 100644 --- a/lib/rexml/formatters/pretty.rb +++ b/lib/rexml/formatters/pretty.rb @@ -1,4 +1,4 @@ -# frozen_string_literal: false +# frozen_string_literal: true require_relative 'default' module REXML @@ -58,7 +58,7 @@ def write_element(node, output) skip = false if compact if node.children.inject(true) {|s,c| s & c.kind_of?(Text)} - string = "" + string = +"" old_level = @level @level = 0 node.children.each { |child| write( child, string ) } @@ -111,7 +111,7 @@ def write_document( node, output ) # itself, then we don't need a carriage return... which makes this # logic more complex. node.children.each { |child| - next if child == node.children[-1] and child.instance_of?(Text) + next if child.instance_of?(Text) unless child == node.children[0] or child.instance_of?(Text) or (child == node.children[1] and !node.children[0].writethis) output << "\n" diff --git a/lib/rexml/functions.rb b/lib/rexml/functions.rb index 77926bf2..4c114616 100644 --- a/lib/rexml/functions.rb +++ b/lib/rexml/functions.rb @@ -262,11 +262,10 @@ def Functions::string_length( string ) string(string).length end - # UNTESTED def Functions::normalize_space( string=nil ) string = string(@@context[:node]) if string.nil? if string.kind_of? Array - string.collect{|x| string.to_s.strip.gsub(/\s+/um, ' ') if string} + string.collect{|x| x.to_s.strip.gsub(/\s+/um, ' ') if x} else string.to_s.strip.gsub(/\s+/um, ' ') end diff --git a/lib/rexml/namespace.rb b/lib/rexml/namespace.rb index 924edf95..2e67252a 100644 --- a/lib/rexml/namespace.rb +++ b/lib/rexml/namespace.rb @@ -1,4 +1,4 @@ -# frozen_string_literal: false +# frozen_string_literal: true require_relative 'xmltokens' @@ -10,13 +10,17 @@ module Namespace # The expanded name of the object, valid if name is set attr_accessor :prefix include XMLTokens + NAME_WITHOUT_NAMESPACE = /\A#{NCNAME_STR}\z/ NAMESPLIT = /^(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})/u # Sets the name and the expanded name def name=( name ) @expanded_name = name - case name - when NAMESPLIT + if name.match?(NAME_WITHOUT_NAMESPACE) + @prefix = "" + @namespace = "" + @name = name + elsif name =~ NAMESPLIT if $1 @prefix = $1 else @@ -24,7 +28,7 @@ def name=( name ) @namespace = "" end @name = $2 - when "" + elsif name == "" @prefix = nil @namespace = nil @name = nil diff --git a/lib/rexml/node.rb b/lib/rexml/node.rb index 081caba6..c771db70 100644 --- a/lib/rexml/node.rb +++ b/lib/rexml/node.rb @@ -52,10 +52,14 @@ def parent? # Visit all subnodes of +self+ recursively def each_recursive(&block) # :yields: node - self.elements.each {|node| - block.call(node) - node.each_recursive(&block) - } + stack = [] + each { |child| stack.unshift child if child.node_type == :element } + until stack.empty? + child = stack.pop + yield child + n = stack.size + child.each { |grandchild| stack.insert n, grandchild if grandchild.node_type == :element } + end end # Find (and return) first subnode (recursively) for which the block diff --git a/lib/rexml/parseexception.rb b/lib/rexml/parseexception.rb index 7b16cd1a..e57d05fd 100644 --- a/lib/rexml/parseexception.rb +++ b/lib/rexml/parseexception.rb @@ -29,6 +29,7 @@ def to_s err << "\nLine: #{line}\n" err << "Position: #{position}\n" err << "Last 80 unconsumed characters:\n" + err.force_encoding("ASCII-8BIT") err << @source.buffer[0..80].force_encoding("ASCII-8BIT").gsub(/\n/, ' ') end diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 305b1207..44dc6580 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -1,4 +1,4 @@ -# frozen_string_literal: false +# frozen_string_literal: true require_relative '../parseexception' require_relative '../undefinednamespaceexception' require_relative '../source' @@ -7,6 +7,17 @@ module REXML module Parsers + if StringScanner::Version < "3.0.8" + module StringScannerCaptures + refine StringScanner do + def captures + values_at(*(1...size)) + end + end + end + using StringScannerCaptures + end + # = Using the Pull Parser # This API is experimental, and subject to change. # parser = PullParser.new( "texttxet" ) @@ -96,7 +107,7 @@ class BaseParser ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))" PEDECL = "" GEDECL = "" - ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um + ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um NOTATIONDECL_START = /\A\s* [/'/, "'", "'", /'/] } + module Private + TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um + CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um + ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um + NAME_PATTERN = /#{NAME}/um + GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>" + PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>" + ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um + CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/ + CHARACTER_REFERENCES = /�*((?:\d+)|(?:x[a-fA-F0-9]+));/ + DEFAULT_ENTITIES_PATTERNS = {} + default_entities = ['gt', 'lt', 'quot', 'apos', 'amp'] + default_entities.each do |term| + DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/ + end + end + private_constant :Private + def initialize( source ) self.stream = source @listeners = [] + @prefixes = Set.new + @entity_expansion_count = 0 end def add_listener( listener ) @@ -122,10 +153,12 @@ def add_listener( listener ) end attr_reader :source + attr_reader :entity_expansion_count def stream=( source ) @source = SourceFactory.create_from( source ) @closed = nil + @have_root = false @document_status = nil @tags = [] @stack = [] @@ -180,6 +213,8 @@ def peek depth=0 # Returns the next event. This is a +PullEvent+ object. def pull + @source.drop_parsed_content + pull_event.tap do |event| @listeners.each do |listener| listener.receive event @@ -192,236 +227,269 @@ def pull_event x, @closed = @closed, nil return [ :end_element, x ] end - return [ :end_document ] if empty? + if empty? + if @document_status == :in_doctype + raise ParseException.new("Malformed DOCTYPE: unclosed", @source) + end + return [ :end_document ] + end return @stack.shift if @stack.size > 0 #STDERR.puts @source.encoding #STDERR.puts "BUFFER = #{@source.buffer.inspect}" + + @source.ensure_buffer if @document_status == nil - word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um ) - word = word[1] unless word.nil? - #STDERR.puts "WORD = #{word.inspect}" - case word - when COMMENT_START - return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ] - when XMLDECL_START - #STDERR.puts "XMLDECL" - results = @source.match( XMLDECL_PATTERN, true )[1] - version = VERSION.match( results ) - version = version[1] unless version.nil? - encoding = ENCODING.match(results) - encoding = encoding[1] unless encoding.nil? - if need_source_encoding_update?(encoding) - @source.encoding = encoding - end - if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding - encoding = "UTF-16" - end - standalone = STANDALONE.match(results) - standalone = standalone[1] unless standalone.nil? - return [ :xmldecl, version, encoding, standalone ] - when INSTRUCTION_START + start_position = @source.position + if @source.match("/um, true) - id = [nil, nil, nil] - @document_status = :after_doctype - else - id = parse_id(base_error_message, - accept_external_id: true, - accept_public_id: false) - if id[0] == "SYSTEM" - # For backward compatibility - id[1], id[2] = id[2], nil + elsif @source.match("/um, true) + if md.nil? + raise REXML::ParseException.new("Unclosed comment", @source) end - if @source.match(/\A\s*\[/um, true) + if /--|-\z/.match?(md[1]) + raise REXML::ParseException.new("Malformed comment", @source) + end + return [ :comment, md[1] ] + elsif @source.match("DOCTYPE", true) + base_error_message = "Malformed DOCTYPE" + unless @source.match(/\s+/um, true) + if @source.match(">") + message = "#{base_error_message}: name is missing" + else + message = "#{base_error_message}: invalid name" + end + @source.position = start_position + raise REXML::ParseException.new(message, @source) + end + @nsstack.unshift(Set.new) + name = parse_name(base_error_message) + if @source.match(/\s*\[/um, true) + id = [nil, nil, nil] @document_status = :in_doctype - elsif @source.match(/\A\s*>/um, true) + elsif @source.match(/\s*>/um, true) + id = [nil, nil, nil] @document_status = :after_doctype + @source.ensure_buffer else - message = "#{base_error_message}: garbage after external ID" - raise REXML::ParseException.new(message, @source) + id = parse_id(base_error_message, + accept_external_id: true, + accept_public_id: false) + if id[0] == "SYSTEM" + # For backward compatibility + id[1], id[2] = id[2], nil + end + if @source.match(/\s*\[/um, true) + @document_status = :in_doctype + elsif @source.match(/\s*>/um, true) + @document_status = :after_doctype + @source.ensure_buffer + else + message = "#{base_error_message}: garbage after external ID" + raise REXML::ParseException.new(message, @source) + end end - end - args = [:start_doctype, name, *id] - if @document_status == :after_doctype - @source.match(/\A\s*/um, true) - @stack << [ :end_doctype ] - end - return args - when /\A\s+/ - else - @document_status = :after_doctype - if @source.encoding == "UTF-8" - @source.buffer.force_encoding(::Encoding::UTF_8) + args = [:start_doctype, name, *id] + if @document_status == :after_doctype + @source.match(/\s*/um, true) + @stack << [ :end_doctype ] + end + return args + else + message = "Invalid XML" + raise REXML::ParseException.new(message, @source) end end end if @document_status == :in_doctype - md = @source.match(/\A\s*(.*?>)/um) - case md[1] - when SYSTEMENTITY - match = @source.match( SYSTEMENTITY, true )[1] - return [ :externalentity, match ] - - when ELEMENTDECL_START - return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ] - - when ENTITY_START - match = @source.match( ENTITYDECL, true ).to_a.compact - match[0] = :entitydecl - ref = false - if match[1] == '%' - ref = true - match.delete_at 1 - end - # Now we have to sort out what kind of entity reference this is - if match[2] == 'SYSTEM' - # External reference - match[3] = match[3][1..-2] # PUBID - match.delete_at(4) if match.size > 4 # Chop out NDATA decl - # match is [ :entity, name, SYSTEM, pubid(, ndata)? ] - elsif match[2] == 'PUBLIC' - # External reference - match[3] = match[3][1..-2] # PUBID - match[4] = match[4][1..-2] # HREF - match.delete_at(5) if match.size > 5 # Chop out NDATA decl - # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ] - else - match[2] = match[2][1..-2] - match.pop if match.size == 4 - # match is [ :entity, name, value ] - end - match << '%' if ref - return match - when ATTLISTDECL_START - md = @source.match( ATTLISTDECL_PATTERN, true ) - raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil? - element = md[1] - contents = md[0] - - pairs = {} - values = md[0].scan( ATTDEF_RE ) - values.each do |attdef| - unless attdef[3] == "#IMPLIED" - attdef.compact! - val = attdef[3] - val = attdef[4] if val == "#FIXED " - pairs[attdef[0]] = val - if attdef[0] =~ /^xmlns:(.*)/ - @nsstack[0] << $1 - end + @source.match(/\s*/um, true) # skip spaces + start_position = @source.position + if @source.match("/um, true) + raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil? + return [ :elementdecl, "/um) - message = "#{base_error_message}: name is missing" + match = [:entitydecl, *match_data.captures.compact] + ref = false + if match[1] == '%' + ref = true + match.delete_at 1 + end + # Now we have to sort out what kind of entity reference this is + if match[2] == 'SYSTEM' + # External reference + match[3] = match[3][1..-2] # PUBID + match.delete_at(4) if match.size > 4 # Chop out NDATA decl + # match is [ :entity, name, SYSTEM, pubid(, ndata)? ] + elsif match[2] == 'PUBLIC' + # External reference + match[3] = match[3][1..-2] # PUBID + match[4] = match[4][1..-2] # HREF + match.delete_at(5) if match.size > 5 # Chop out NDATA decl + # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ] else - message = "#{base_error_message}: invalid declaration name" + match[2] = match[2][1..-2] + match.pop if match.size == 4 + # match is [ :entity, name, value ] end - raise REXML::ParseException.new(message, @source) - end - name = parse_name(base_error_message) - id = parse_id(base_error_message, - accept_external_id: true, - accept_public_id: true) - unless @source.match(/\A\s*>/um, true) - message = "#{base_error_message}: garbage before end >" - raise REXML::ParseException.new(message, @source) + match << '%' if ref + return match + elsif @source.match("ATTLIST", true) + md = @source.match(Private::ATTLISTDECL_END, true) + raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil? + element = md[1] + contents = md[0] + + pairs = {} + values = md[0].strip.scan( ATTDEF_RE ) + values.each do |attdef| + unless attdef[3] == "#IMPLIED" + attdef.compact! + val = attdef[3] + val = attdef[4] if val == "#FIXED " + pairs[attdef[0]] = val + if attdef[0] =~ /^xmlns:(.*)/ + @nsstack[0] << $1 + end + end + end + return [ :attlistdecl, element, pairs, contents ] + elsif @source.match("NOTATION", true) + base_error_message = "Malformed notation declaration" + unless @source.match(/\s+/um, true) + if @source.match(">") + message = "#{base_error_message}: name is missing" + else + message = "#{base_error_message}: invalid name" + end + @source.position = start_position + raise REXML::ParseException.new(message, @source) + end + name = parse_name(base_error_message) + id = parse_id(base_error_message, + accept_external_id: true, + accept_public_id: true) + unless @source.match(/\s*>/um, true) + message = "#{base_error_message}: garbage before end >" + raise REXML::ParseException.new(message, @source) + end + return [:notationdecl, name, *id] + elsif md = @source.match(/--(.*?)-->/um, true) + case md[1] + when /--/, /-\z/ + raise REXML::ParseException.new("Malformed comment", @source) + end + return [ :comment, md[1] ] if md end - return [:notationdecl, name, *id] - when DOCTYPE_END + elsif match = @source.match(/(%.*?;)\s*/um, true) + return [ :externalentity, match[1] ] + elsif @source.match(/\]\s*>/um, true) @document_status = :after_doctype - @source.match( DOCTYPE_END, true ) return [ :end_doctype ] end + if @document_status == :in_doctype + raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source) + end end if @document_status == :after_doctype - @source.match(/\A\s*/um, true) + @source.match(/\s*/um, true) end begin - @source.read if @source.buffer.size<2 - if @source.buffer[0] == ?< - if @source.buffer[1] == ?/ + start_position = @source.position + if @source.match("<", true) + # :text's read_until may remain only "<" in buffer. In the + # case, buffer is empty here. So we need to fill buffer + # here explicitly. + @source.ensure_buffer + if @source.match("/", true) @nsstack.shift last_tag = @tags.pop - md = @source.match( CLOSE_MATCH, true ) + md = @source.match(Private::CLOSE_PATTERN, true) if md and !last_tag message = "Unexpected top-level end tag (got '#{md[1]}')" raise REXML::ParseException.new(message, @source) end if md.nil? or last_tag != md[1] message = "Missing end tag for '#{last_tag}'" - message << " (got '#{md[1]}')" if md + message += " (got '#{md[1]}')" if md + @source.position = start_position if md.nil? raise REXML::ParseException.new(message, @source) end return [ :end_element, last_tag ] - elsif @source.buffer[1] == ?! - md = @source.match(/\A(\s*[^>]*>)/um) + elsif @source.match("!", true) + md = @source.match(/([^>]*>)/um) #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}" raise REXML::ParseException.new("Malformed node", @source) unless md - if md[0][2] == ?- - md = @source.match( COMMENT_PATTERN, true ) + if md[0][0] == ?- + md = @source.match(/--(.*?)-->/um, true) - case md[1] - when /--/, /-\z/ + if md.nil? || /--|-\z/.match?(md[1]) raise REXML::ParseException.new("Malformed comment", @source) end - return [ :comment, md[1] ] if md + return [ :comment, md[1] ] else - md = @source.match( CDATA_PATTERN, true ) + md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true) return [ :cdata, md[1] ] if md end raise REXML::ParseException.new( "Declarations can only occur "+ "in the doctype declaration.", @source) - elsif @source.buffer[1] == ?? + elsif @source.match("?", true) return process_instruction else # Get the next tag - md = @source.match(TAG_MATCH, true) + md = @source.match(Private::TAG_PATTERN, true) unless md + @source.position = start_position raise REXML::ParseException.new("malformed XML: missing tag start", @source) end + tag = md[1] @document_status = :in_element - prefixes = Set.new - prefixes << md[2] if md[2] + @prefixes.clear + @prefixes << md[2] if md[2] @nsstack.unshift(curr_ns=Set.new) - attributes, closed = parse_attributes(prefixes, curr_ns) + attributes, closed = parse_attributes(@prefixes, curr_ns) # Verify that all of the prefixes have been defined - for prefix in prefixes + for prefix in @prefixes unless @nsstack.find{|k| k.member?(prefix)} raise UndefinedNamespaceException.new(prefix,@source,self) end end if closed - @closed = md[1] + @closed = tag @nsstack.shift else - @tags.push( md[1] ) + if @tags.empty? and @have_root + raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source) + end + @tags.push( tag ) end - return [ :start_element, md[1], attributes ] + @have_root = true + return [ :start_element, tag, attributes ] end else - md = @source.match( TEXT_PATTERN, true ) - if md[0].length == 0 - @source.match( /(\s+)/, true ) + text = @source.read_until("<") + if text.chomp!("<") + @source.position -= "<".bytesize + end + if @tags.empty? + unless /\A\s*\z/.match?(text) + if @have_root + raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source) + else + raise ParseException.new("Malformed XML: Content at the start of the document (got '#{text}')", @source) + end + end + return pull_event if @have_root end - #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0 - #return [ :text, "" ] if md[0].length == 0 - # unnormalized = Text::unnormalize( md[1], self ) - # return PullEvent.new( :text, md[1], unnormalized ) - return [ :text, md[1] ] + return [ :text, text ] end rescue REXML::UndefinedNamespaceException raise @@ -438,7 +506,9 @@ def pull_event def entity( reference, entities ) value = nil value = entities[ reference ] if entities - if not value + if value + record_entity_expansion + else value = DEFAULT_ENTITIES[ reference ] value = value[2] if value end @@ -463,35 +533,51 @@ def normalize( input, entities=nil, entity_filter=nil ) # Unescapes all possible entities def unnormalize( string, entities=nil, filter=nil ) - rv = string.clone - rv.gsub!( /\r\n?/, "\n" ) + if string.include?("\r") + rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" ) + else + rv = string.dup + end matches = rv.scan( REFERENCE_RE ) return rv if matches.size == 0 - rv.gsub!( /�*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) { + rv.gsub!( Private::CHARACTER_REFERENCES ) { m=$1 m = "0#{m}" if m[0] == ?x [Integer(m)].pack('U*') } matches.collect!{|x|x[0]}.compact! if matches.size > 0 + sum = 0 matches.each do |entity_reference| unless filter and filter.include?(entity_reference) entity_value = entity( entity_reference, entities ) if entity_value - re = /&#{entity_reference};/ + re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/ rv.gsub!( re, entity_value ) + sum += rv.bytesize + if sum > Security.entity_expansion_text_limit + raise "entity expansion has grown too large" + end else er = DEFAULT_ENTITIES[entity_reference] rv.gsub!( er[0], er[2] ) if er end end end - rv.gsub!( /&/, '&' ) + rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' ) end rv end private + + def record_entity_expansion + @entity_expansion_count += 1 + if @entity_expansion_count > Security.entity_expansion_limit + raise "number of entity expansions exceeded, processing aborted." + end + end + def need_source_encoding_update?(xml_declaration_encoding) return false if xml_declaration_encoding.nil? return false if /\AUTF-16\z/i =~ xml_declaration_encoding @@ -499,16 +585,16 @@ def need_source_encoding_update?(xml_declaration_encoding) end def parse_name(base_error_message) - md = @source.match(/\A\s*#{NAME}/um, true) + md = @source.match(Private::NAME_PATTERN, true) unless md - if @source.match(/\A\s*\S/um) + if @source.match(/\S/um) message = "#{base_error_message}: invalid name" else message = "#{base_error_message}: name is missing" end raise REXML::ParseException.new(message, @source) end - md[1] + md[0] end def parse_id(base_error_message, @@ -578,96 +664,99 @@ def parse_id_invalid_details(accept_external_id:, end def process_instruction - match_data = @source.match(INSTRUCTION_PATTERN, true) - unless match_data - message = "Invalid processing instruction node" - raise REXML::ParseException.new(message, @source) + name = parse_name("Malformed XML: Invalid processing instruction node") + if @source.match(/\s+/um, true) + match_data = @source.match(/(.*?)\?>/um, true) + unless match_data + raise ParseException.new("Malformed XML: Unclosed processing instruction", @source) + end + content = match_data[1] + else + content = nil + unless @source.match("?>", true) + raise ParseException.new("Malformed XML: Unclosed processing instruction", @source) + end + end + if name == "xml" + if @document_status + raise ParseException.new("Malformed XML: XML declaration is not at the start", @source) + end + version = VERSION.match(content) + version = version[1] unless version.nil? + encoding = ENCODING.match(content) + encoding = encoding[1] unless encoding.nil? + if need_source_encoding_update?(encoding) + @source.encoding = encoding + end + if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding + encoding = "UTF-16" + end + standalone = STANDALONE.match(content) + standalone = standalone[1] unless standalone.nil? + return [ :xmldecl, version, encoding, standalone ] end - [:processing_instruction, match_data[1], match_data[2]] + [:processing_instruction, name, content] end def parse_attributes(prefixes, curr_ns) attributes = {} closed = false - match_data = @source.match(/^(.*?)(\/)?>/um, true) - if match_data.nil? - message = "Start tag isn't ended" - raise REXML::ParseException.new(message, @source) - end - - raw_attributes = match_data[1] - closed = !match_data[2].nil? - return attributes, closed if raw_attributes.nil? - return attributes, closed if raw_attributes.empty? - - scanner = StringScanner.new(raw_attributes) - until scanner.eos? - if scanner.scan(/\s+/) - break if scanner.eos? - end - - pos = scanner.pos - loop do - break if scanner.scan(ATTRIBUTE_PATTERN) - unless scanner.scan(QNAME) - message = "Invalid attribute name: <#{scanner.rest}>" - raise REXML::ParseException.new(message, @source) - end - name = scanner[0] - unless scanner.scan(/\s*=\s*/um) + while true + if @source.match(">", true) + return attributes, closed + elsif @source.match("/>", true) + closed = true + return attributes, closed + elsif match = @source.match(QNAME, true) + name = match[1] + prefix = match[2] + local_part = match[3] + + unless @source.match(/\s*=\s*/um, true) message = "Missing attribute equal: <#{name}>" raise REXML::ParseException.new(message, @source) end - quote = scanner.scan(/['"]/) - unless quote + unless match = @source.match(/(['"])/, true) message = "Missing attribute value start quote: <#{name}>" raise REXML::ParseException.new(message, @source) end - unless scanner.scan(/.*#{Regexp.escape(quote)}/um) - match_data = @source.match(/^(.*?)(\/)?>/um, true) - if match_data - scanner << "/" if closed - scanner << ">" - scanner << match_data[1] - scanner.pos = pos - closed = !match_data[2].nil? - next - end - message = - "Missing attribute value end quote: <#{name}>: <#{quote}>" + quote = match[1] + start_position = @source.position + value = @source.read_until(quote) + unless value.chomp!(quote) + @source.position = start_position + message = "Missing attribute value end quote: <#{name}>: <#{quote}>" raise REXML::ParseException.new(message, @source) end - end - name = scanner[1] - prefix = scanner[2] - local_part = scanner[3] - # quote = scanner[4] - value = scanner[5] - if prefix == "xmlns" - if local_part == "xml" - if value != "http://www.w3.org/XML/1998/namespace" - msg = "The 'xml' prefix must not be bound to any other namespace "+ + @source.match(/\s*/um, true) + if prefix == "xmlns" + if local_part == "xml" + if value != "http://www.w3.org/XML/1998/namespace" + msg = "The 'xml' prefix must not be bound to any other namespace "+ + "(http://www.w3.org/TR/REC-xml-names/#ns-decl)" + raise REXML::ParseException.new( msg, @source, self ) + end + elsif local_part == "xmlns" + msg = "The 'xmlns' prefix must not be declared "+ "(http://www.w3.org/TR/REC-xml-names/#ns-decl)" - raise REXML::ParseException.new( msg, @source, self ) + raise REXML::ParseException.new( msg, @source, self) end - elsif local_part == "xmlns" - msg = "The 'xmlns' prefix must not be declared "+ - "(http://www.w3.org/TR/REC-xml-names/#ns-decl)" - raise REXML::ParseException.new( msg, @source, self) + curr_ns << local_part + elsif prefix + prefixes << prefix unless prefix == "xml" end - curr_ns << local_part - elsif prefix - prefixes << prefix unless prefix == "xml" - end - if attributes.has_key?(name) - msg = "Duplicate attribute #{name.inspect}" - raise REXML::ParseException.new(msg, @source, self) - end + if attributes[name] + msg = "Duplicate attribute #{name.inspect}" + raise REXML::ParseException.new(msg, @source, self) + end - attributes[name] = value + attributes[name] = value + else + message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>" + raise REXML::ParseException.new(message, @source) + end end - return attributes, closed end end end diff --git a/lib/rexml/parsers/pullparser.rb b/lib/rexml/parsers/pullparser.rb index f8b232a2..36b45953 100644 --- a/lib/rexml/parsers/pullparser.rb +++ b/lib/rexml/parsers/pullparser.rb @@ -47,6 +47,10 @@ def add_listener( listener ) @listeners << listener end + def entity_expansion_count + @parser.entity_expansion_count + end + def each while has_next? yield self.pull diff --git a/lib/rexml/parsers/sax2parser.rb b/lib/rexml/parsers/sax2parser.rb index 6a24ce22..cec9d2fc 100644 --- a/lib/rexml/parsers/sax2parser.rb +++ b/lib/rexml/parsers/sax2parser.rb @@ -22,6 +22,10 @@ def source @parser.source end + def entity_expansion_count + @parser.entity_expansion_count + end + def add_listener( listener ) @parser.add_listener( listener ) end @@ -157,25 +161,8 @@ def parse end end when :text - #normalized = @parser.normalize( event[1] ) - #handle( :characters, normalized ) - copy = event[1].clone - - esub = proc { |match| - if @entities.has_key?($1) - @entities[$1].gsub(Text::REFERENCE, &esub) - else - match - end - } - - copy.gsub!( Text::REFERENCE, &esub ) - copy.gsub!( Text::NUMERICENTITY ) {|m| - m=$1 - m = "0#{m}" if m[0] == ?x - [Integer(m)].pack('U*') - } - handle( :characters, copy ) + unnormalized = @parser.unnormalize( event[1], @entities ) + handle( :characters, unnormalized ) when :entitydecl handle_entitydecl( event ) when :processing_instruction, :comment, :attlistdecl, diff --git a/lib/rexml/parsers/streamparser.rb b/lib/rexml/parsers/streamparser.rb index 9e0eb0b3..fa3ac496 100644 --- a/lib/rexml/parsers/streamparser.rb +++ b/lib/rexml/parsers/streamparser.rb @@ -36,8 +36,8 @@ def parse @listener.tag_end( event[1] ) @tag_stack.pop when :text - normalized = @parser.unnormalize( event[1] ) - @listener.text( normalized ) + unnormalized = @parser.unnormalize( event[1] ) + @listener.text( unnormalized ) when :processing_instruction @listener.instruction( *event[1,2] ) when :start_doctype diff --git a/lib/rexml/parsers/treeparser.rb b/lib/rexml/parsers/treeparser.rb index bf9a4254..0cb6f7cc 100644 --- a/lib/rexml/parsers/treeparser.rb +++ b/lib/rexml/parsers/treeparser.rb @@ -16,7 +16,6 @@ def add_listener( listener ) def parse tag_stack = [] - in_doctype = false entities = nil begin while true @@ -39,17 +38,15 @@ def parse tag_stack.pop @build_context = @build_context.parent when :text - if not in_doctype - if @build_context[-1].instance_of? Text - @build_context[-1] << event[1] - else - @build_context.add( - Text.new(event[1], @build_context.whitespace, nil, true) - ) unless ( - @build_context.ignore_whitespace_nodes and - event[1].strip.size==0 - ) - end + if @build_context[-1].instance_of? Text + @build_context[-1] << event[1] + else + @build_context.add( + Text.new(event[1], @build_context.whitespace, nil, true) + ) unless ( + @build_context.ignore_whitespace_nodes and + event[1].strip.size==0 + ) end when :comment c = Comment.new( event[1] ) @@ -60,14 +57,12 @@ def parse when :processing_instruction @build_context.add( Instruction.new( event[1], event[2] ) ) when :end_doctype - in_doctype = false entities.each { |k,v| entities[k] = @build_context.entities[k].value } @build_context = @build_context.parent when :start_doctype doctype = DocType.new( event[1..-1], @build_context ) @build_context = doctype entities = {} - in_doctype = true when :attlistdecl n = AttlistDecl.new( event[1..-1] ) @build_context.add( n ) diff --git a/lib/rexml/parsers/xpathparser.rb b/lib/rexml/parsers/xpathparser.rb index d92678fe..bd3b6856 100644 --- a/lib/rexml/parsers/xpathparser.rb +++ b/lib/rexml/parsers/xpathparser.rb @@ -1,4 +1,5 @@ # frozen_string_literal: false + require_relative '../namespace' require_relative '../xmltokens' @@ -38,108 +39,143 @@ def predicate path parsed end - def abbreviate( path ) - path = path.kind_of?(String) ? parse( path ) : path - string = "" - document = false - while path.size > 0 - op = path.shift + def abbreviate(path_or_parsed) + if path_or_parsed.kind_of?(String) + parsed = parse(path_or_parsed) + else + parsed = path_or_parsed + end + components = [] + component = nil + while parsed.size > 0 + op = parsed.shift case op when :node + component << "node()" when :attribute - string << "/" if string.size > 0 - string << "@" + component = "@" + components << component when :child - string << "/" if string.size > 0 + component = "" + components << component when :descendant_or_self - string << "/" + next_op = parsed[0] + if next_op == :node + parsed.shift + component = "" + components << component + else + component = "descendant-or-self::" + components << component + end when :self - string << "." + next_op = parsed[0] + if next_op == :node + parsed.shift + components << "." + else + component = "self::" + components << component + end when :parent - string << ".." + next_op = parsed[0] + if next_op == :node + parsed.shift + components << ".." + else + component = "parent::" + components << component + end when :any - string << "*" + component << "*" when :text - string << "text()" + component << "text()" when :following, :following_sibling, :ancestor, :ancestor_or_self, :descendant, :namespace, :preceding, :preceding_sibling - string << "/" unless string.size == 0 - string << op.to_s.tr("_", "-") - string << "::" + component = op.to_s.tr("_", "-") << "::" + components << component when :qname - prefix = path.shift - name = path.shift - string << prefix+":" if prefix.size > 0 - string << name + prefix = parsed.shift + name = parsed.shift + component << prefix+":" if prefix.size > 0 + component << name when :predicate - string << '[' - string << predicate_to_string( path.shift ) {|x| abbreviate( x ) } - string << ']' + component << '[' + component << predicate_to_path(parsed.shift) {|x| abbreviate(x)} + component << ']' when :document - document = true + components << "" when :function - string << path.shift - string << "( " - string << predicate_to_string( path.shift[0] ) {|x| abbreviate( x )} - string << " )" + component << parsed.shift + component << "( " + component << predicate_to_path(parsed.shift[0]) {|x| abbreviate(x)} + component << " )" when :literal - string << %Q{ "#{path.shift}" } + component << quote_literal(parsed.shift) else - string << "/" unless string.size == 0 - string << "UNKNOWN(" - string << op.inspect - string << ")" + component << "UNKNOWN(" + component << op.inspect + component << ")" end end - string = "/"+string if document - return string + case components + when [""] + "/" + when ["", ""] + "//" + else + components.join("/") + end end - def expand( path ) - path = path.kind_of?(String) ? parse( path ) : path - string = "" + def expand(path_or_parsed) + if path_or_parsed.kind_of?(String) + parsed = parse(path_or_parsed) + else + parsed = path_or_parsed + end + path = "" document = false - while path.size > 0 - op = path.shift + while parsed.size > 0 + op = parsed.shift case op when :node - string << "node()" + path << "node()" when :attribute, :child, :following, :following_sibling, :ancestor, :ancestor_or_self, :descendant, :descendant_or_self, :namespace, :preceding, :preceding_sibling, :self, :parent - string << "/" unless string.size == 0 - string << op.to_s.tr("_", "-") - string << "::" + path << "/" unless path.size == 0 + path << op.to_s.tr("_", "-") + path << "::" when :any - string << "*" + path << "*" when :qname - prefix = path.shift - name = path.shift - string << prefix+":" if prefix.size > 0 - string << name + prefix = parsed.shift + name = parsed.shift + path << prefix+":" if prefix.size > 0 + path << name when :predicate - string << '[' - string << predicate_to_string( path.shift ) { |x| expand(x) } - string << ']' + path << '[' + path << predicate_to_path( parsed.shift ) { |x| expand(x) } + path << ']' when :document document = true else - string << "/" unless string.size == 0 - string << "UNKNOWN(" - string << op.inspect - string << ")" + path << "UNKNOWN(" + path << op.inspect + path << ")" end end - string = "/"+string if document - return string + path = "/"+path if document + path end - def predicate_to_string( path, &block ) - string = "" - case path[0] + def predicate_to_path(parsed, &block) + path = "" + case parsed[0] when :and, :or, :mult, :plus, :minus, :neq, :eq, :lt, :gt, :lteq, :gteq, :div, :mod, :union - op = path.shift + op = parsed.shift case op when :eq op = "=" @@ -156,36 +192,50 @@ def predicate_to_string( path, &block ) when :union op = "|" end - left = predicate_to_string( path.shift, &block ) - right = predicate_to_string( path.shift, &block ) - string << " " - string << left - string << " " - string << op.to_s - string << " " - string << right - string << " " + left = predicate_to_path( parsed.shift, &block ) + right = predicate_to_path( parsed.shift, &block ) + path << left + path << " " + path << op.to_s + path << " " + path << right when :function - path.shift - name = path.shift - string << name - string << "( " - string << predicate_to_string( path.shift, &block ) - string << " )" + parsed.shift + name = parsed.shift + path << name + path << "(" + parsed.shift.each_with_index do |argument, i| + path << ", " if i > 0 + path << predicate_to_path(argument, &block) + end + path << ")" when :literal - path.shift - string << " " - string << path.shift.inspect - string << " " + parsed.shift + path << quote_literal(parsed.shift) else - string << " " - string << yield( path ) - string << " " + path << yield( parsed ) end - return string.squeeze(" ") + return path.squeeze(" ") end + # For backward compatibility + alias_method :preciate_to_string, :predicate_to_path private + def quote_literal( literal ) + case literal + when String + # XPath 1.0 does not support escape characters. + # Assumes literal does not contain both single and double quotes. + if literal.include?("'") + "\"#{literal}\"" + else + "'#{literal}'" + end + else + literal.inspect + end + end + #LocationPath # | RelativeLocationPath # | '/' RelativeLocationPath? diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index 8a01f0e1..39e92a57 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -26,10 +26,12 @@ # - REXML::Document. # - REXML::Element. # +# There's also an {REXML tutorial}[doc/rexml/tutorial_rdoc.html]. +# module REXML COPYRIGHT = "Copyright © 2001-2008 Sean Russell " DATE = "2008/019" - VERSION = "3.2.5" + VERSION = "3.3.3" REVISION = "" Copyright = COPYRIGHT diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 90b370b9..ff887fc0 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -1,8 +1,28 @@ # coding: US-ASCII # frozen_string_literal: false + +require "strscan" + require_relative 'encoding' module REXML + if StringScanner::Version < "1.0.0" + module StringScannerCheckScanString + refine StringScanner do + def check(pattern) + pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String) + super(pattern) + end + + def scan(pattern) + pattern = /#{Regexp.escape(pattern)}/ if pattern.is_a?(String) + super(pattern) + end + end + end + using StringScannerCheckScanString + end + # Generates Source-s. USE THIS CLASS. class SourceFactory # Generates a Source object @@ -30,18 +50,27 @@ def SourceFactory::create_from(arg) # objects and provides consumption of text class Source include Encoding - # The current buffer (what we're going to read next) - attr_reader :buffer # The line number of the last consumed text attr_reader :line attr_reader :encoding + module Private + SCANNER_RESET_SIZE = 100000 + PRE_DEFINED_TERM_PATTERNS = {} + pre_defined_terms = ["'", '"', "<"] + pre_defined_terms.each do |term| + PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/ + end + end + private_constant :Private + # Constructor # @param arg must be a String, and should be a valid XML document # @param encoding if non-null, sets the encoding of the source to this # value, overriding all encoding detection def initialize(arg, encoding=nil) - @orig = @buffer = arg + @orig = arg + @scanner = StringScanner.new(@orig) if encoding self.encoding = encoding else @@ -50,6 +79,20 @@ def initialize(arg, encoding=nil) @line = 0 end + # The current buffer (what we're going to read next) + def buffer + @scanner.rest + end + + def drop_parsed_content + if @scanner.pos > Private::SCANNER_RESET_SIZE + @scanner.string = @scanner.rest + end + end + + def buffer_encoding=(encoding) + @scanner.string.force_encoding(encoding) + end # Inherited from Encoding # Overridden to support optimized en/decoding @@ -58,98 +101,78 @@ def encoding=(enc) encoding_updated end - # Scans the source for a given pattern. Note, that this is not your - # usual scan() method. For one thing, the pattern argument has some - # requirements; for another, the source can be consumed. You can easily - # confuse this method. Originally, the patterns were easier - # to construct and this method more robust, because this method - # generated search regexps on the fly; however, this was - # computationally expensive and slowed down the entire REXML package - # considerably, since this is by far the most commonly called method. - # @param pattern must be a Regexp, and must be in the form of - # /^\s*(#{your pattern, with no groups})(.*)/. The first group - # will be returned; the second group is used if the consume flag is - # set. - # @param consume if true, the pattern returned will be consumed, leaving - # everything after it in the Source. - # @return the pattern, if found, or nil if the Source is empty or the - # pattern is not found. - def scan(pattern, cons=false) - return nil if @buffer.nil? - rv = @buffer.scan(pattern) - @buffer = $' if cons and rv.size>0 - rv + def read(term = nil) end - def read + def read_until(term) + pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/ + data = @scanner.scan_until(pattern) + unless data + data = @scanner.rest + @scanner.pos = @scanner.string.bytesize + end + data end - def consume( pattern ) - @buffer = $' if pattern.match( @buffer ) + def ensure_buffer end - def match_to( char, pattern ) - return pattern.match(@buffer) + def match(pattern, cons=false) + if cons + @scanner.scan(pattern).nil? ? nil : @scanner + else + @scanner.check(pattern).nil? ? nil : @scanner + end end - def match_to_consume( char, pattern ) - md = pattern.match(@buffer) - @buffer = $' - return md + def position + @scanner.pos end - def match(pattern, cons=false) - md = pattern.match(@buffer) - @buffer = $' if cons and md - return md + def position=(pos) + @scanner.pos = pos end # @return true if the Source is exhausted def empty? - @buffer == "" - end - - def position - @orig.index( @buffer ) + @scanner.eos? end # @return the current line in the source def current_line lines = @orig.split - res = lines.grep @buffer[0..30] + res = lines.grep @scanner.rest[0..30] res = res[-1] if res.kind_of? Array lines.index( res ) if res end private + def detect_encoding - buffer_encoding = @buffer.encoding + scanner_encoding = @scanner.rest.encoding detected_encoding = "UTF-8" begin - @buffer.force_encoding("ASCII-8BIT") - if @buffer[0, 2] == "\xfe\xff" - @buffer[0, 2] = "" + @scanner.string.force_encoding("ASCII-8BIT") + if @scanner.scan(/\xfe\xff/n) detected_encoding = "UTF-16BE" - elsif @buffer[0, 2] == "\xff\xfe" - @buffer[0, 2] = "" + elsif @scanner.scan(/\xff\xfe/n) detected_encoding = "UTF-16LE" - elsif @buffer[0, 3] == "\xef\xbb\xbf" - @buffer[0, 3] = "" + elsif @scanner.scan(/\xef\xbb\xbf/n) detected_encoding = "UTF-8" end ensure - @buffer.force_encoding(buffer_encoding) + @scanner.string.force_encoding(scanner_encoding) end self.encoding = detected_encoding end def encoding_updated if @encoding != 'UTF-8' - @buffer = decode(@buffer) + @scanner.string = decode(@scanner.rest) @to_utf = true else @to_utf = false - @buffer.force_encoding ::Encoding::UTF_8 + @scanner.string.force_encoding(::Encoding::UTF_8) end end end @@ -172,7 +195,7 @@ def initialize(arg, block_size=500, encoding=nil) end if !@to_utf and - @buffer.respond_to?(:force_encoding) and + @orig.respond_to?(:force_encoding) and @source.respond_to?(:external_encoding) and @source.external_encoding != ::Encoding::UTF_8 @force_utf8 = true @@ -181,65 +204,72 @@ def initialize(arg, block_size=500, encoding=nil) end end - def scan(pattern, cons=false) - rv = super - # You'll notice that this next section is very similar to the same - # section in match(), but just a liiittle different. This is - # because it is a touch faster to do it this way with scan() - # than the way match() does it; enough faster to warrant duplicating - # some code - if rv.size == 0 - until @buffer =~ pattern or @source.nil? - begin - @buffer << readline - rescue Iconv::IllegalSequence - raise - rescue - @source = nil + def read(term = nil, min_bytes = 1) + term = encode(term) if term + begin + str = readline(term) + @scanner << str + read_bytes = str.bytesize + begin + while read_bytes < min_bytes + str = readline(term) + @scanner << str + read_bytes += str.bytesize end + rescue IOError end - rv = super + true + rescue Exception, NameError + @source = nil + false end - rv.taint if RUBY_VERSION < '2.7' - rv end - def read - begin - @buffer << readline - rescue Exception, NameError - @source = nil + def read_until(term) + pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/ + term = encode(term) + until str = @scanner.scan_until(pattern) + break if @source.nil? + break if @source.eof? + @scanner << readline(term) + end + if str + read if @scanner.eos? and !@source.eof? + str + else + rest = @scanner.rest + @scanner.pos = @scanner.string.bytesize + rest end end - def consume( pattern ) - match( pattern, true ) + def ensure_buffer + read if @scanner.eos? && @source end def match( pattern, cons=false ) - rv = pattern.match(@buffer) - @buffer = $' if cons and rv - while !rv and @source - begin - @buffer << readline - rv = pattern.match(@buffer) - @buffer = $' if cons and rv - rescue - @source = nil + # To avoid performance issue, we need to increase bytes to read per scan + min_bytes = 1 + while true + if cons + md = @scanner.scan(pattern) + else + md = @scanner.check(pattern) end + break if md + return nil if pattern.is_a?(String) + return nil if @source.nil? + return nil unless read(nil, min_bytes) + min_bytes *= 2 end - rv.taint if RUBY_VERSION < '2.7' - rv + + md.nil? ? nil : @scanner end def empty? super and ( @source.nil? || @source.eof? ) end - def position - @er_source.pos rescue 0 - end - # @return the current line in the source def current_line begin @@ -263,8 +293,8 @@ def current_line end private - def readline - str = @source.readline(@line_break) + def readline(term = nil) + str = @source.readline(term || @line_break) if @pending_buffer if str.nil? str = @pending_buffer @@ -290,7 +320,7 @@ def encoding_updated @source.set_encoding(@encoding, @encoding) end @line_break = encode(">") - @pending_buffer, @buffer = @buffer, "" + @pending_buffer, @scanner.string = @scanner.rest, "" @pending_buffer.force_encoding(@encoding) super end diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb index 050b09c9..7e0befe9 100644 --- a/lib/rexml/text.rb +++ b/lib/rexml/text.rb @@ -1,4 +1,4 @@ -# frozen_string_literal: false +# frozen_string_literal: true require_relative 'security' require_relative 'entity' require_relative 'doctype' @@ -131,7 +131,7 @@ def parent= parent def Text.check string, pattern, doctype # illegal anywhere - if string !~ VALID_XML_CHARS + if !string.match?(VALID_XML_CHARS) if String.method_defined? :encode string.chars.each do |c| case c.ord @@ -151,25 +151,45 @@ def Text.check string, pattern, doctype end end - # context sensitive - string.scan(pattern) do - if $1[-1] != ?; - raise "Illegal character #{$1.inspect} in raw string #{string.inspect}" - elsif $1[0] == ?& - if $5 and $5[0] == ?# - case ($5[1] == ?x ? $5[2..-1].to_i(16) : $5[1..-1].to_i) - when *VALID_CHAR + pos = 0 + while (index = string.index(/<|&/, pos)) + if string[index] == "<" + raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}" + end + + unless (end_index = string.index(/[^\s];/, index + 1)) + raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}" + end + + value = string[(index + 1)..end_index] + if /\s/.match?(value) + raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}" + end + + if value[0] == "#" + character_reference = value[1..-1] + + unless (/\A(\d+|x[0-9a-fA-F]+)\z/.match?(character_reference)) + if character_reference[0] == "x" || character_reference[-1] == "x" + raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}" else - raise "Illegal character #{$1.inspect} in raw string #{string.inspect}" + raise "Illegal character #{string.inspect} in raw string #{string.inspect}" end - # FIXME: below can't work but this needs API change. - # elsif @parent and $3 and !SUBSTITUTES.include?($1) - # if !doctype or !doctype.entities.has_key?($3) - # raise "Undeclared entity '#{$1}' in raw string \"#{string}\"" - # end end + + case (character_reference[0] == "x" ? character_reference[1..-1].to_i(16) : character_reference[0..-1].to_i) + when *VALID_CHAR + else + raise "Illegal character #{string.inspect} in raw string #{string.inspect}" + end + elsif !(/\A#{Entity::NAME}\z/um.match?(value)) + raise "Illegal character \"#{string[index]}\" in raw string #{string.inspect}" end + + pos = end_index + 1 end + + string end def node_type @@ -371,7 +391,7 @@ def Text::normalize( input, doctype=nil, entity_filter=nil ) copy = input.to_s # Doing it like this rather than in a loop improves the speed #copy = copy.gsub( EREFERENCE, '&' ) - copy = copy.gsub( "&", "&" ) + copy = copy.gsub( "&", "&" ) if copy.include?("&") if doctype # Replace all ampersands that aren't part of an entity doctype.entities.each_value do |entity| @@ -382,7 +402,9 @@ def Text::normalize( input, doctype=nil, entity_filter=nil ) else # Replace all ampersands that aren't part of an entity DocType::DEFAULT_ENTITIES.each_value do |entity| - copy = copy.gsub(entity.value, "&#{entity.name};" ) + if copy.include?(entity.value) + copy = copy.gsub(entity.value, "&#{entity.name};" ) + end end end copy diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb index d8b88e7a..5eb1e5a9 100644 --- a/lib/rexml/xpath_parser.rb +++ b/lib/rexml/xpath_parser.rb @@ -590,6 +590,7 @@ def filter_nodeset(nodeset) def evaluate_predicate(expression, nodesets) enter(:predicate, expression, nodesets) if @debug + new_nodeset_count = 0 new_nodesets = nodesets.collect do |nodeset| new_nodeset = [] subcontext = { :size => nodeset.size } @@ -606,17 +607,20 @@ def evaluate_predicate(expression, nodesets) result = result[0] if result.kind_of? Array and result.length == 1 if result.kind_of? Numeric if result == node.position - new_nodeset << XPathNode.new(node, position: new_nodeset.size + 1) + new_nodeset_count += 1 + new_nodeset << XPathNode.new(node, position: new_nodeset_count) end elsif result.instance_of? Array if result.size > 0 and result.inject(false) {|k,s| s or k} if result.size > 0 - new_nodeset << XPathNode.new(node, position: new_nodeset.size + 1) + new_nodeset_count += 1 + new_nodeset << XPathNode.new(node, position: new_nodeset_count) end end else if result - new_nodeset << XPathNode.new(node, position: new_nodeset.size + 1) + new_nodeset_count += 1 + new_nodeset << XPathNode.new(node, position: new_nodeset_count) end end end diff --git a/rexml.gemspec b/rexml.gemspec index 620a8981..0de3e845 100644 --- a/rexml.gemspec +++ b/rexml.gemspec @@ -16,6 +16,10 @@ Gem::Specification.new do |spec| spec.homepage = "https://github.com/ruby/rexml" spec.license = "BSD-2-Clause" + spec.metadata = { + "changelog_uri" => "#{spec.homepage}/releases/tag/v#{spec.version}" + } + files = [ "LICENSE.txt", "NEWS.md", @@ -52,10 +56,8 @@ Gem::Specification.new do |spec| spec.files = files spec.rdoc_options.concat(["--main", "README.md"]) spec.extra_rdoc_files = rdoc_files - spec.bindir = "exe" - spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) } - spec.add_development_dependency "bundler" - spec.add_development_dependency "rake" - spec.add_development_dependency "test-unit" + spec.required_ruby_version = '>= 2.5.0' + + spec.add_runtime_dependency("strscan") end diff --git a/test/data/much_ado.xml b/test/data/much_ado.xml index f008fadb..0040088c 100644 --- a/test/data/much_ado.xml +++ b/test/data/much_ado.xml @@ -4735,7 +4735,7 @@ CLAUDIO, BENEDICK, HERO, BEATRICE, and Attendants But they shall find, awaked in such a kind, Both strength of limb and policy of mind, Ability in means and choice of friends, -To quit me of them throughly. +To quit me of them thoroughly. diff --git a/test/data/ofbiz-issues-full-177.xml b/test/data/ofbiz-issues-full-177.xml index bfff771d..e1f7bdfd 100644 --- a/test/data/ofbiz-issues-full-177.xml +++ b/test/data/ofbiz-issues-full-177.xml @@ -152,8 +152,8 @@ - - + + diff --git a/test/data/test/tests.xml b/test/data/test/tests.xml index cf03b42b..fd415679 100644 --- a/test/data/test/tests.xml +++ b/test/data/test/tests.xml @@ -299,7 +299,7 @@ - + web-app web-app web-app @@ -318,7 +318,7 @@ - + web-app web-app web-app diff --git a/test/data/tutorial.xml b/test/data/tutorial.xml index bf5783d0..9c4639b9 100644 --- a/test/data/tutorial.xml +++ b/test/data/tutorial.xml @@ -286,7 +286,7 @@ el1 << Text.new(" cruel world") strings.

I can't emphasize this enough, because people do have problems with - this. REXML can't possibly alway guess correctly how your text is + this. REXML can't possibly always guess correctly how your text is encoded, so it always assumes the text is UTF-8. It also does not warn you when you try to add text which isn't properly encoded, for the same reason. You must make sure that you are adding UTF-8 text. diff --git a/test/formatter/test_default.rb b/test/formatter/test_default.rb index 321d8180..aa403dbe 100644 --- a/test/formatter/test_default.rb +++ b/test/formatter/test_default.rb @@ -2,7 +2,7 @@ module REXMLTests class DefaultFormatterTest < Test::Unit::TestCase def format(node) formatter = REXML::Formatters::Default.new - output = "" + output = +"" formatter.write(node, output) output end diff --git a/test/functions/test_base.rb b/test/functions/test_base.rb index 74dc1a31..daa38156 100644 --- a/test/functions/test_base.rb +++ b/test/functions/test_base.rb @@ -229,8 +229,30 @@ def test_normalize_space assert_equal( [REXML::Comment.new("COMMENT A")], m ) end + def test_normalize_space_strings + source = <<-XML +breakfast boosts\t\t + +concentration +Coffee beans + aroma + + + + Dessert + \t\t after dinner + XML + normalized_texts = REXML::XPath.each(REXML::Document.new(source), "normalize-space(//text())").to_a + assert_equal([ + "breakfast boosts concentration", + "Coffee beans aroma", + "Dessert after dinner", + ], + normalized_texts) + end + def test_string_nil_without_context - doc = REXML::Document.new(<<-XML) + doc = REXML::Document.new(<<~XML) diff --git a/test/parse/test_attribute_list_declaration.rb b/test/parse/test_attribute_list_declaration.rb new file mode 100644 index 00000000..43882528 --- /dev/null +++ b/test/parse/test_attribute_list_declaration.rb @@ -0,0 +1,30 @@ +require "test/unit" +require "core_assertions" + +require "rexml/document" + +module REXMLTests + class TestParseAttributeListDeclaration < Test::Unit::TestCase + include Test::Unit::CoreAssertions + + def test_linear_performance_space + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new("]>") + end + end + + def test_linear_performance_tab_and_gt + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new("" * n + + "\">]>") + end + end + end +end diff --git a/test/parse/test_cdata.rb b/test/parse/test_cdata.rb new file mode 100644 index 00000000..b5f1a3bc --- /dev/null +++ b/test/parse/test_cdata.rb @@ -0,0 +1,17 @@ +require "test/unit" +require "core_assertions" + +require "rexml/document" + +module REXMLTests + class TestParseCData < Test::Unit::TestCase + include Test::Unit::CoreAssertions + + def test_linear_performance_gt + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('" * n + ' ]]>') + end + end + end +end diff --git a/test/parse/test_character_reference.rb b/test/parse/test_character_reference.rb new file mode 100644 index 00000000..bf8d2190 --- /dev/null +++ b/test/parse/test_character_reference.rb @@ -0,0 +1,17 @@ +require "test/unit" +require "core_assertions" + +require "rexml/document" + +module REXMLTests + class TestParseCharacterReference < Test::Unit::TestCase + include Test::Unit::CoreAssertions + + def test_linear_performance_many_preceding_zeros + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('') + end + end + end +end diff --git a/test/parse/test_comment.rb b/test/parse/test_comment.rb new file mode 100644 index 00000000..4475dca7 --- /dev/null +++ b/test/parse/test_comment.rb @@ -0,0 +1,151 @@ +require "test/unit" +require "core_assertions" + +require "rexml/document" + +module REXMLTests + class TestParseComment < Test::Unit::TestCase + include Test::Unit::CoreAssertions + + def parse(xml) + REXML::Document.new(xml) + end + + class TestInvalid < self + def test_toplevel_unclosed_comment + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed comment + Line: 1 + Position: 11 + Last 80 unconsumed characters: + DETAIL + end + + def test_toplevel_malformed_comment_end + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed comment + Line: 1 + Position: 9 + Last 80 unconsumed characters: + DETAIL + end + + def test_doctype_malformed_comment_inner + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed comment + Line: 1 + Position: 26 + Last 80 unconsumed characters: + DETAIL + end + + def test_doctype_malformed_comment_end + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed comment + Line: 1 + Position: 24 + Last 80 unconsumed characters: + DETAIL + end + + def test_after_doctype_malformed_comment_short + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed comment + Line: 1 + Position: 8 + Last 80 unconsumed characters: + --> + DETAIL + end + + def test_after_doctype_malformed_comment_inner + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed comment + Line: 1 + Position: 14 + Last 80 unconsumed characters: + DETAIL + end + + def test_after_doctype_malformed_comment_end + exception = assert_raise(REXML::ParseException) do + parse("") + end + assert_equal(<<~DETAIL, exception.to_s) + Malformed comment + Line: 1 + Position: 12 + Last 80 unconsumed characters: + DETAIL + end + end + + def test_before_root + parser = REXML::Parsers::BaseParser.new('') + + events = {} + while parser.has_next? + event = parser.pull + events[event[0]] = event[1] + end + + assert_equal(" ok comment ", events[:comment]) + end + + def test_after_root + parser = REXML::Parsers::BaseParser.new('') + + events = {} + while parser.has_next? + event = parser.pull + events[event[0]] = event[1] + end + + assert_equal(" ok comment ", events[:comment]) + end + + def test_linear_performance_top_level_gt + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('') + end + end + + def test_linear_performance_in_element_gt + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new('') + end + end + end +end diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb index 55713909..99c23745 100644 --- a/test/parse/test_document_type_declaration.rb +++ b/test/parse/test_document_type_declaration.rb @@ -1,9 +1,13 @@ # frozen_string_literal: false require "test/unit" +require "core_assertions" + require "rexml/document" module REXMLTests class TestParseDocumentTypeDeclaration < Test::Unit::TestCase + include Test::Unit::CoreAssertions + private def parse(doctype) REXML::Document.new(<<-XML).doctype @@ -36,6 +40,66 @@ def test_garbage_plus_before_name_at_line_start + r SYSTEM "urn:x-rexml:test" [ ]> DETAIL end + + def test_no_name + exception = assert_raise(REXML::ParseException) do + parse(<<-DOCTYPE) + + DOCTYPE + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed DOCTYPE: name is missing +Line: 3 +Position: 17 +Last 80 unconsumed characters: + + DETAIL + end + end + + class TestUnclosed < self + def test_no_extra_node + exception = assert_raise(REXML::ParseException) do + REXML::Document.new(" + DOCTYPE + end + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed DOCTYPE: invalid declaration + Line: 1 + Position: 20 + Last 80 unconsumed characters: + #{' '} + DETAIL + end + + def test_text + exception = assert_raise(REXML::ParseException) do + REXML::Document.new(<<~DOCTYPE) + " * n + "]>") + rescue + end + end + end + + def test_linear_performance_comment_gt + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new("" * n + " -->]>") + end + end + + def test_linear_performance_external_entity_right_bracket_gt + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new("" * n + ";]>") + end + end end end diff --git a/test/parse/test_element.rb b/test/parse/test_element.rb index 9f172a28..2b0746ea 100644 --- a/test/parse/test_element.rb +++ b/test/parse/test_element.rb @@ -1,8 +1,12 @@ require "test/unit" +require "core_assertions" + require "rexml/document" module REXMLTests class TestParseElement < Test::Unit::TestCase + include Test::Unit::CoreAssertions + def parse(xml) REXML::Document.new(xml) end @@ -41,9 +45,22 @@ def test_empty_namespace_attribute_name assert_equal(<<-DETAIL.chomp, exception.to_s) Invalid attribute name: <:a=""> Line: 1 -Position: 9 +Position: 13 Last 80 unconsumed characters: +:a=""> + DETAIL + end + def test_empty_namespace_attribute_name_with_utf8_character + exception = assert_raise(REXML::ParseException) do + parse("") # U+200B ZERO WIDTH SPACE + end + assert_equal(<<-DETAIL.chomp.force_encoding("ASCII-8BIT"), exception.to_s) +Invalid attribute name: <:\xE2\x80\x8B> +Line: 1 +Position: 8 +Last 80 unconsumed characters: +:\xE2\x80\x8B> DETAIL end @@ -72,6 +89,47 @@ def test_garbage_less_than_slash_before_end_tag_at_line_start DETAIL end + + def test_after_root + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Extra tag at the end of the document (got '') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Extra tag at the end of the document (got '" * n + '">') + end end end end diff --git a/test/parse/test_entity_declaration.rb b/test/parse/test_entity_declaration.rb new file mode 100644 index 00000000..81d95b58 --- /dev/null +++ b/test/parse/test_entity_declaration.rb @@ -0,0 +1,557 @@ +# frozen_string_literal: false +require "test/unit" +require "core_assertions" + +require "rexml/document" + +module REXMLTests + class TestParseEntityDeclaration < Test::Unit::TestCase + include Test::Unit::CoreAssertions + + private + def xml(internal_subset) + <<-XML + + + XML + end + + def parse(internal_subset) + REXML::Document.new(xml(internal_subset)).doctype + end + + public + + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-GEDecl + class TestGeneralEntityDeclaration < self + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Name + class TestName < self + def test_prohibited_character + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 61 +Last 80 unconsumed characters: + invalid&name "valid-entity-value">]> + DETAIL + end + end + + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EntityDef + class TestEntityDefinition < self + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EntityValue + class TestEntityValue < self + def test_no_quote + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 59 +Last 80 unconsumed characters: + valid-name invalid-entity-value>]> + DETAIL + end + + def test_prohibited_character + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 44 +Last 80 unconsumed characters: + valid-name "% &">]> + DETAIL + end + + def test_mixed_quote + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 61 +Last 80 unconsumed characters: + valid-name "invalid-entity-value'>]> + DETAIL + end + end + + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-ExternalID + class TestExternalID < self + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-SystemLiteral + class TestSystemLiteral < self + def test_no_quote_in_system + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 68 +Last 80 unconsumed characters: + valid-name SYSTEM invalid-system-literal>]> + DETAIL + end + + def test_no_quote_in_public + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 90 +Last 80 unconsumed characters: + valid-name PUBLIC "valid-pubid-literal" invalid-system-literal>]> + DETAIL + end + + def test_mixed_quote_in_system + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 70 +Last 80 unconsumed characters: + valid-name SYSTEM 'invalid-system-literal">]> + DETAIL + end + + def test_mixed_quote_in_public + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 92 +Last 80 unconsumed characters: + valid-name PUBLIC "valid-pubid-literal" "invalid-system-literal'>]> + DETAIL + end + + def test_no_literal_in_system + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 45 +Last 80 unconsumed characters: + valid-name SYSTEM>]> + DETAIL + end + + def test_no_literal_in_public + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 67 +Last 80 unconsumed characters: + valid-name PUBLIC "valid-pubid-literal">]> + DETAIL + end + end + + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PubidLiteral + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PubidChar + class TestPublicIDLiteral < self + def test_no_quote + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 90 +Last 80 unconsumed characters: + valid-name PUBLIC invalid-pubid-literal "valid-system-literal">]> + DETAIL + end + + def test_prohibited_pubid_character + exception = assert_raise(REXML::ParseException) do + # U+3042 HIRAGANA LETTER A + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.force_encoding('utf-8').chomp, exception.to_s.force_encoding('utf-8')) +Malformed entity declaration +Line: 1 +Position: 74 +Last 80 unconsumed characters: + valid-name PUBLIC "\u3042" "valid-system-literal">]> + DETAIL + end + + def test_mixed_quote + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 92 +Last 80 unconsumed characters: + valid-name PUBLIC "invalid-pubid-literal' "valid-system-literal">]> + DETAIL + end + + def test_no_literal + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 45 +Last 80 unconsumed characters: + valid-name PUBLIC>]> + DETAIL + end + end + end + + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-NDataDecl + class TestNotationDataDeclaration < self + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-NameChar + def test_prohibited_character + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 109 +Last 80 unconsumed characters: + valid-name PUBLIC "valid-pubid-literal" "valid-system-literal" NDATA invalid&nam + DETAIL + end + end + + def test_entity_value_and_notation_data_declaration + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 83 +Last 80 unconsumed characters: + valid-name "valid-entity-value" NDATA valid-ndata-value>]> + DETAIL + end + end + + def test_no_space + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 102 +Last 80 unconsumed characters: + valid-namePUBLIC"valid-pubid-literal""valid-system-literal"NDATAvalid-name>]> + DETAIL + end + end + + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PEDecl + class TestParsedEntityDeclaration < self + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Name + class TestName < self + def test_prohibited_character + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 63 +Last 80 unconsumed characters: + % invalid&name "valid-entity-value">]> + DETAIL + end + end + + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PEDef + class TestParsedEntityDefinition < self + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EntityValue + class TestEntityValue < self + def test_no_quote + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 61 +Last 80 unconsumed characters: + % valid-name invalid-entity-value>]> + DETAIL + end + + def test_prohibited_character + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 46 +Last 80 unconsumed characters: + % valid-name "% &">]> + DETAIL + end + + def test_mixed_quote + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 63 +Last 80 unconsumed characters: + % valid-name 'invalid-entity-value">]> + DETAIL + end + end + + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-ExternalID + class TestExternalID < self + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-SystemLiteral + class TestSystemLiteral < self + def test_no_quote_in_system + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 70 +Last 80 unconsumed characters: + % valid-name SYSTEM invalid-system-literal>]> + DETAIL + end + + def test_no_quote_in_public + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 92 +Last 80 unconsumed characters: + % valid-name PUBLIC "valid-pubid-literal" invalid-system-literal>]> + DETAIL + end + + def test_mixed_quote_in_system + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 72 +Last 80 unconsumed characters: + % valid-name SYSTEM "invalid-system-literal'>]> + DETAIL + end + + def test_mixed_quote_in_public + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 94 +Last 80 unconsumed characters: + % valid-name PUBLIC "valid-pubid-literal" 'invalid-system-literal">]> + DETAIL + end + + def test_no_literal_in_system + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 47 +Last 80 unconsumed characters: + % valid-name SYSTEM>]> + DETAIL + end + + def test_no_literal_in_public + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 69 +Last 80 unconsumed characters: + % valid-name PUBLIC "valid-pubid-literal">]> + DETAIL + end + end + + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PubidLiteral + # https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PubidChar + class TestPublicIDLiteral < self + def test_no_quote + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 92 +Last 80 unconsumed characters: + % valid-name PUBLIC invalid-pubid-literal "valid-system-literal">]> + DETAIL + end + + def test_prohibited_pubid_character + exception = assert_raise(REXML::ParseException) do + # U+3042 HIRAGANA LETTER A + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.force_encoding('utf-8').chomp, exception.to_s.force_encoding('utf-8')) +Malformed entity declaration +Line: 1 +Position: 76 +Last 80 unconsumed characters: + % valid-name PUBLIC "\u3042" "valid-system-literal">]> + DETAIL + end + + def test_mixed_quote + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 94 +Last 80 unconsumed characters: + % valid-name PUBLIC 'invalid-pubid-literal" "valid-system-literal">]> + DETAIL + end + + def test_no_literal + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 47 +Last 80 unconsumed characters: + % valid-name PUBLIC>]> + DETAIL + end + end + end + + def test_entity_value_and_notation_data_declaration + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 85 +Last 80 unconsumed characters: + % valid-name "valid-entity-value" NDATA valid-ndata-value>]> + DETAIL + end + end + + def test_no_space + exception = assert_raise(REXML::ParseException) do + REXML::Document.new("]>") + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 1 +Position: 67 +Last 80 unconsumed characters: + %valid-nameSYSTEM"valid-system-literal">]> + DETAIL + end + end + + def test_empty + exception = assert_raise(REXML::ParseException) do + parse(<<-INTERNAL_SUBSET) + + INTERNAL_SUBSET + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed entity declaration +Line: 5 +Position: 70 +Last 80 unconsumed characters: +> ]> + DETAIL + end + + def test_linear_performance_entity_value_gt + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new("" * n + + "\">]>") + end + end + + def test_linear_performance_entity_value_gt_right_bracket + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new("]" * n + + "\">]>") + end + end + + def test_linear_performance_system_literal_in_system_gt_right_bracket + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new("]" * n + + "\">]>") + end + end + + def test_linear_performance_system_literal_in_public_gt_right_bracket + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new("]" * n + + "\">]>") + end + end + end +end diff --git a/test/parse/test_notation_declaration.rb b/test/parse/test_notation_declaration.rb index 19a0536d..9e81b6a4 100644 --- a/test/parse/test_notation_declaration.rb +++ b/test/parse/test_notation_declaration.rb @@ -35,7 +35,7 @@ def test_no_name Line: 5 Position: 72 Last 80 unconsumed characters: - ]> + ]> DETAIL end diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb index f0c0c24e..ba381dc4 100644 --- a/test/parse/test_processing_instruction.rb +++ b/test/parse/test_processing_instruction.rb @@ -1,8 +1,12 @@ require "test/unit" +require "core_assertions" + require "rexml/document" module REXMLTests - class TestParseProcessinInstruction < Test::Unit::TestCase + class TestParseProcessingInstruction < Test::Unit::TestCase + include Test::Unit::CoreAssertions + def parse(xml) REXML::Document.new(xml) end @@ -13,31 +17,110 @@ def test_no_name parse("") end assert_equal(<<-DETAIL.chomp, exception.to_s) -Invalid processing instruction node +Malformed XML: Invalid processing instruction node: invalid name Line: 1 Position: 4 Last 80 unconsumed characters: - +?> + DETAIL + end + + def test_unclosed_content + exception = assert_raise(REXML::ParseException) do + parse("') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: XML declaration is not at the start + Line: 1 + Position: 25 + Last 80 unconsumed characters: + DETAIL end + end - def test_garbage_text - # TODO: This should be parse error. - # Create test/parse/test_document.rb or something and move this to it. - doc = parse(<<-XML) -x?> - XML - pi = doc.children[1] - assert_equal([ - "x", - "y\n"]], + [[doc.children[0].target, doc.children[0].content], + [doc.children[1].target, doc.children[1].content]]) + end + + def test_before_root + parser = REXML::Parsers::BaseParser.new('') + + events = {} + while parser.has_next? + event = parser.pull + events[event[0]] = event[1] + end + + assert_equal("abc", events[:processing_instruction]) + end + + def test_after_root + parser = REXML::Parsers::BaseParser.new('') + + events = {} + while parser.has_next? + event = parser.pull + events[event[0]] = event[1] + end + + assert_equal("abc", events[:processing_instruction]) + end + + def test_content_question + document = REXML::Document.new("") + assert_equal("con?tent", document.root.children.first.content) + end + + def test_linear_performance_gt + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new("" * n + " ?>") + end + end + + def test_linear_performance_tab + seq = [10000, 50000, 100000, 150000, 200000] + assert_linear_performance(seq, rehearsal: 10) do |n| + REXML::Document.new(" ?>") end end end diff --git a/test/parse/test_text.rb b/test/parse/test_text.rb new file mode 100644 index 00000000..04f553ae --- /dev/null +++ b/test/parse/test_text.rb @@ -0,0 +1,57 @@ +require "test/unit" +require 'rexml/parsers/baseparser' + +module REXMLTests + class TestParseText < Test::Unit::TestCase + class TestInvalid < self + def test_before_root + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('b') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Content at the start of the document (got 'b') + Line: 1 + Position: 4 + Last 80 unconsumed characters: + + DETAIL + end + + def test_after_root + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('c') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: Extra content at the end of the document (got 'c') + Line: 1 + Position: 8 + Last 80 unconsumed characters: + + DETAIL + end + end + + def test_whitespace_characters_after_root + parser = REXML::Parsers::BaseParser.new('b ') + + events = [] + while parser.has_next? + event = parser.pull + case event[0] + when :text + events << event[1] + end + end + + assert_equal(["b"], events) + end + end +end diff --git a/test/parser/test_base_parser.rb b/test/parser/test_base_parser.rb new file mode 100644 index 00000000..17d01979 --- /dev/null +++ b/test/parser/test_base_parser.rb @@ -0,0 +1,27 @@ +# frozen_string_literal: false + +require 'rexml/parsers/baseparser' + +module REXMLTests + class BaseParserTester < Test::Unit::TestCase + def test_large_xml + large_text = "a" * 100_000 + xml = <<-XML + + + #{large_text} + #{large_text} + + XML + + parser = REXML::Parsers::BaseParser.new(xml) + while parser.has_next? + parser.pull + end + + assert do + parser.position < xml.bytesize + end + end + end +end diff --git a/test/parser/test_ultra_light.rb b/test/parser/test_ultra_light.rb index 44fd1d1e..b3f576ff 100644 --- a/test/parser/test_ultra_light.rb +++ b/test/parser/test_ultra_light.rb @@ -17,7 +17,6 @@ def test_entity_declaration [:entitydecl, "name", "value"] ], [:start_element, :parent, "root", {}], - [:text, "\n"], ], parse(<<-INTERNAL_SUBSET)) diff --git a/test/parser/test_xpath.rb b/test/parser/test_xpath.rb new file mode 100644 index 00000000..9143d25c --- /dev/null +++ b/test/parser/test_xpath.rb @@ -0,0 +1,115 @@ +# frozen_string_literal: false + +require "test/unit" +require "rexml/parsers/xpathparser" + +module REXMLTests + class TestXPathParser < Test::Unit::TestCase + sub_test_case("#abbreviate") do + def abbreviate(xpath) + parser = REXML::Parsers::XPathParser.new + parser.abbreviate(xpath) + end + + def test_document + assert_equal("/", + abbreviate("/")) + end + + def test_descendant_or_self_only + assert_equal("//", + abbreviate("/descendant-or-self::node()/")) + end + + def test_descendant_or_self_absolute + assert_equal("//a/b", + abbreviate("/descendant-or-self::node()/a/b")) + end + + def test_descendant_or_self_relative + assert_equal("a//b", + abbreviate("a/descendant-or-self::node()/b")) + end + + def test_descendant_or_self_not_node + assert_equal("/descendant-or-self::text()", + abbreviate("/descendant-or-self::text()")) + end + + def test_self_absolute + assert_equal("/a/./b", + abbreviate("/a/self::node()/b")) + end + + def test_self_relative + assert_equal("a/./b", + abbreviate("a/self::node()/b")) + end + + def test_self_not_node + assert_equal("/self::text()", + abbreviate("/self::text()")) + end + + def test_parent_absolute + assert_equal("/a/../b", + abbreviate("/a/parent::node()/b")) + end + + def test_parent_relative + assert_equal("a/../b", + abbreviate("a/parent::node()/b")) + end + + def test_parent_not_node + assert_equal("/a/parent::text()", + abbreviate("/a/parent::text()")) + end + + def test_any_absolute + assert_equal("/*/a", + abbreviate("/*/a")) + end + + def test_any_relative + assert_equal("a/*/b", + abbreviate("a/*/b")) + end + + def test_following_sibling_absolute + assert_equal("/following-sibling::a/b", + abbreviate("/following-sibling::a/b")) + end + + def test_following_sibling_relative + assert_equal("a/following-sibling::b/c", + abbreviate("a/following-sibling::b/c")) + end + + def test_predicate_index + assert_equal("a[5]/b", + abbreviate("a[5]/b")) + end + + def test_attribute_relative + assert_equal("a/@b", + abbreviate("a/attribute::b")) + end + + def test_filter_attribute + assert_equal("a/b[@i = 1]/c", + abbreviate("a/b[attribute::i=1]/c")) + end + + def test_filter_string_single_quote + assert_equal("a/b[@name = \"single ' quote\"]/c", + abbreviate("a/b[attribute::name=\"single ' quote\"]/c")) + end + + def test_filter_string_double_quote + assert_equal("a/b[@name = 'double \" quote']/c", + abbreviate("a/b[attribute::name='double \" quote']/c")) + end + end + end +end diff --git a/test/test_attributes.rb b/test/test_attributes.rb index 91fc68a5..09fde442 100644 --- a/test/test_attributes.rb +++ b/test/test_attributes.rb @@ -178,18 +178,27 @@ def test_amp_and_lf_attributes attr_test('name','value with LF & ampersand') end - def test_quoting + def test_quote_root d = Document.new(%q{}) assert_equal( %q{}, d.to_s ) d.root.context[:attribute_quote] = :quote assert_equal( %q{}, d.to_s ) + end + def test_quote_sub_element d = Document.new(%q{}) assert_equal( %q{}, d.to_s ) d.root.context[:attribute_quote] = :quote assert_equal( %q{}, d.to_s ) end + def test_quote_to_s_value + doc = Document.new(%q{}, {attribute_quote: :quote}) + assert_equal(%q{}, doc.to_s) + assert_equal("'", doc.root.attribute("a").value) + assert_equal(%q{}, doc.to_s) + end + def test_ticket_127 doc = Document.new doc.add_element 'a', { 'v' => 'x & y' } diff --git a/test/test_contrib.rb b/test/test_contrib.rb index f3ad0b6c..23ee35b1 100644 --- a/test/test_contrib.rb +++ b/test/test_contrib.rb @@ -80,7 +80,7 @@ def test_bad_doctype_Tobias # Peter Verhage def test_namespace_Peter - source = <<-EOF + source = <<~EOF @@ -377,7 +377,7 @@ def test_various_xpath end def test_entities_Holden_Glova - document = <<-EOL + document = <<~EOL diff --git a/test/test_core.rb b/test/test_core.rb index fd3af8c2..e1fba8a7 100644 --- a/test/test_core.rb +++ b/test/test_core.rb @@ -15,7 +15,7 @@ class Tester < Test::Unit::TestCase include Helper::Fixture include REXML def setup - @xsa_source = <<-EOL + @xsa_source = <<~EOL + + + XML + + expected_names = %w[ + root + 1_1 1_2 1_3 + 2_1 2_2 2_3 + ] + + document = REXML::Document.new(xml_source) + + # Node#each_recursive iterates elements only. + # This does not iterate XML declarations, comments, attributes, CDATA sections, etc. + actual_names = [] + document.each_recursive do |element| + actual_names << element.attributes["name"] + end + assert_equal(expected_names, actual_names) + end + class WriteTest < Test::Unit::TestCase def setup - @document = REXML::Document.new(<<-EOX) + @document = REXML::Document.new(<<-EOX.chomp) Hello world! EOX @@ -212,7 +249,7 @@ class ArgumentsTest < self def test_output output = "" @document.write(output) - assert_equal(<<-EOX, output) + assert_equal(<<-EOX.chomp, output) Hello world! EOX @@ -235,7 +272,7 @@ def test_transitive indent = 2 transitive = true @document.write(output, indent, transitive) - assert_equal(<<-EOX, output) + assert_equal(<<-EOX.chomp, output) Hello world! #{japanese_text} EOX @@ -275,7 +312,7 @@ class OptionsTest < self def test_output output = "" @document.write(:output => output) - assert_equal(<<-EOX, output) + assert_equal(<<-EOX.chomp, output) Hello world! EOX @@ -295,7 +332,7 @@ def test_indent def test_transitive output = "" @document.write(:output => output, :indent => 2, :transitive => true) - assert_equal(<<-EOX, output) + assert_equal(<<-EOX.chomp, output) Hello world! output, :encoding => encoding) - assert_equal(<<-EOX.encode(encoding), output) + assert_equal(<<-EOX.chomp.encode(encoding), output) #{japanese_text} EOX @@ -401,7 +438,7 @@ def test_utf_16 actual_xml = "" document.write(actual_xml) - expected_xml = <<-EOX.encode("UTF-16BE") + expected_xml = <<-EOX.chomp.encode("UTF-16BE") \ufeff Hello world! EOX diff --git a/test/test_encoding.rb b/test/test_encoding.rb index 09495c58..6887ffbe 100644 --- a/test/test_encoding.rb +++ b/test/test_encoding.rb @@ -67,7 +67,7 @@ def test_in_different_out # * Given an encoded document, accessing text and attribute nodes # should provide UTF-8 text. def test_in_different_access - doc = Document.new <<-EOL + doc = Document.new <<~EOL \xFF EOL @@ -79,7 +79,7 @@ def test_in_different_access def test_ticket_89 - doc = Document.new <<-EOL + doc = Document.new <<~EOL EOL diff --git a/test/test_light.rb b/test/test_light.rb index 54b2c52e..c556c978 100644 --- a/test/test_light.rb +++ b/test/test_light.rb @@ -62,7 +62,7 @@ def test_access_child_elements assert_equal( 'c', a[1].name ) end - def test_itterate_over_children + def test_iterate_over_children foo = make_small_document ctr = 0 foo[0].each { ctr += 1 } diff --git a/test/test_pullparser.rb b/test/test_pullparser.rb index 53a985ba..55205af8 100644 --- a/test/test_pullparser.rb +++ b/test/test_pullparser.rb @@ -62,6 +62,63 @@ def test_entity_replacement end end + def test_character_references + source = 'AB' + parser = REXML::Parsers::PullParser.new( source ) + + events = {} + element_name = '' + while parser.has_next? + event = parser.pull + case event.event_type + when :start_element + element_name = event[0] + when :text + events[element_name] = event[1] + end + end + + assert_equal('A', events['a']) + assert_equal("B", events['b']) + end + + def test_text_entity_references + source = '<P> <I> <B> Text </B> </I>' + parser = REXML::Parsers::PullParser.new( source ) + + events = [] + while parser.has_next? + event = parser.pull + case event.event_type + when :text + events << event[1] + end + end + + assert_equal(["

Text "], events) + end + + def test_text_content_with_line_breaks + source = "AB\nC\r\n" + parser = REXML::Parsers::PullParser.new( source ) + + events = {} + element_name = '' + while parser.has_next? + event = parser.pull + case event.event_type + when :start_element + element_name = event[0] + when :text + events[element_name] = event[1] + end + end + + assert_equal('A', events['a']) + assert_equal("B\n", events['b']) + assert_equal("C\n", events['c']) + end + def test_peek_unshift source = "" REXML::Parsers::PullParser.new(source) @@ -98,5 +155,101 @@ def test_peek end assert_equal( 0, names.length ) end + + class EntityExpansionLimitTest < Test::Unit::TestCase + def setup + @default_entity_expansion_limit = REXML::Security.entity_expansion_limit + end + + def teardown + REXML::Security.entity_expansion_limit = @default_entity_expansion_limit + end + + class GeneralEntityTest < self + def test_have_value + source = <<-XML + + + + + + +]> + +&a; + + XML + + parser = REXML::Parsers::PullParser.new(source) + assert_raise(RuntimeError.new("entity expansion has grown too large")) do + while parser.has_next? + parser.pull + end + end + end + + def test_empty_value + source = <<-XML + + + + + + +]> + +&a; + + XML + + parser = REXML::Parsers::PullParser.new(source) + assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do + while parser.has_next? + parser.pull + end + end + + REXML::Security.entity_expansion_limit = 100 + parser = REXML::Parsers::PullParser.new(source) + assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do + while parser.has_next? + parser.pull + end + end + assert_equal(101, parser.entity_expansion_count) + end + + def test_with_default_entity + source = <<-XML + + + +]> + +&a; +&a2; +< + + XML + + REXML::Security.entity_expansion_limit = 4 + parser = REXML::Parsers::PullParser.new(source) + while parser.has_next? + parser.pull + end + + REXML::Security.entity_expansion_limit = 3 + parser = REXML::Parsers::PullParser.new(source) + assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do + while parser.has_next? + parser.pull + end + end + end + end + end end end diff --git a/test/test_sax.rb b/test/test_sax.rb index 6f775183..5e3ad75b 100644 --- a/test/test_sax.rb +++ b/test/test_sax.rb @@ -31,6 +31,17 @@ def test_entity_replacement assert_equal '--1234--', results[1] end + def test_characters_predefined_entities + source = '<P> <I> <B> Text </B> </I>' + + sax = Parsers::SAX2Parser.new( source ) + results = [] + sax.listen(:characters) {|x| results << x } + sax.parse + + assert_equal(["

Text "], results) + end + def test_sax2 File.open(fixture_path("documentation.xml")) do |f| parser = Parsers::SAX2Parser.new( f ) @@ -88,6 +99,92 @@ def test_sax2 end end + class EntityExpansionLimitTest < Test::Unit::TestCase + def setup + @default_entity_expansion_limit = REXML::Security.entity_expansion_limit + end + + def teardown + REXML::Security.entity_expansion_limit = @default_entity_expansion_limit + end + + class GeneralEntityTest < self + def test_have_value + source = <<-XML + + + + + + +]> + +&a; + + XML + + sax = REXML::Parsers::SAX2Parser.new(source) + assert_raise(RuntimeError.new("entity expansion has grown too large")) do + sax.parse + end + end + + def test_empty_value + source = <<-XML + + + + + + +]> + +&a; + + XML + + sax = REXML::Parsers::SAX2Parser.new(source) + assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do + sax.parse + end + + REXML::Security.entity_expansion_limit = 100 + sax = REXML::Parsers::SAX2Parser.new(source) + assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do + sax.parse + end + assert_equal(101, sax.entity_expansion_count) + end + + def test_with_default_entity + source = <<-XML + + + +]> + +&a; +&a2; +< + + XML + + REXML::Security.entity_expansion_limit = 4 + sax = REXML::Parsers::SAX2Parser.new(source) + sax.parse + + REXML::Security.entity_expansion_limit = 3 + sax = REXML::Parsers::SAX2Parser.new(source) + assert_raise(RuntimeError.new("number of entity expansions exceeded, processing aborted.")) do + sax.parse + end + end + end + end + # used by test_simple_doctype_listener # submitted by Jeff Barczewski class SimpleDoctypeListener @@ -109,7 +206,7 @@ def doctype(name, pub_sys, long_name, uri) # test simple non-entity doctype in sax listener # submitted by Jeff Barczewski def test_simple_doctype_listener - xml = <<-END + xml = <<~END Hello, world! @@ -140,8 +237,8 @@ def test_simple_doctype_listener # test doctype with missing name, should throw ParseException # submitted by Jeff Barczewseki - def test_doctype_with_mising_name_throws_exception - xml = <<-END + def test_doctype_with_missing_name_throws_exception + xml = <<~END Hello, world! diff --git a/test/test_text_check.rb b/test/test_text_check.rb new file mode 100644 index 00000000..11cf65a3 --- /dev/null +++ b/test/test_text_check.rb @@ -0,0 +1,121 @@ +# frozen_string_literal: false + +module REXMLTests + class TextCheckTester < Test::Unit::TestCase + + def check(string) + REXML::Text.check(string, REXML::Text::NEEDS_A_SECOND_CHECK, nil) + end + + def assert_check(string) + assert_nothing_raised { check(string) } + end + + def assert_check_failed(string, illegal_part) + message = "Illegal character #{illegal_part.inspect} in raw string #{string.inspect}" + assert_raise(RuntimeError.new(message)) do + check(string) + end + end + + class TestValid < self + def test_entity_name_start_char_colon + assert_check("&:;") + end + + def test_entity_name_start_char_under_score + assert_check("&_;") + end + + def test_entity_name_mix + assert_check("&A.b-0123;") + end + + def test_character_reference_decimal + assert_check("¢") + end + + def test_character_reference_hex + assert_check("􏿿") + end + + def test_entity_name_non_ascii + # U+3042 HIRAGANA LETTER A + # U+3044 HIRAGANA LETTER I + assert_check("&\u3042\u3044;") + end + + def test_normal_string + assert_check("foo") + end + end + + class TestInvalid < self + def test_lt + assert_check_failed("<;", "<") + end + + def test_lt_mix + assert_check_failed("ab @@ -24,7 +24,7 @@ def test_validate - } + XML validator = REXML::Validation::RelaxNG.new( rng ) no_error( validator, %q{} ) @@ -33,7 +33,7 @@ def test_validate def test_sequence - rng = %q{ + rng = <<-XML @@ -45,7 +45,7 @@ def test_sequence - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -56,7 +56,7 @@ def test_sequence def test_choice - rng = %q{ + rng = <<-XML @@ -70,7 +70,7 @@ def test_choice - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -79,7 +79,7 @@ def test_choice end def test_optional - rng = %q{ + rng = <<-XML @@ -90,7 +90,7 @@ def test_optional - } + XML validator = REXML::Validation::RelaxNG.new( rng ) no_error( validator, %q{} ) @@ -100,7 +100,7 @@ def test_optional end def test_zero_or_more - rng = %q{ + rng = <<-XML @@ -111,7 +111,7 @@ def test_zero_or_more - } + XML validator = REXML::Validation::RelaxNG.new( rng ) no_error( validator, %q{} ) no_error( validator, %q{} ) @@ -119,7 +119,7 @@ def test_zero_or_more error( validator, %q{} ) error( validator, %q{} ) - rng = %q{ + rng = <<-XML @@ -133,7 +133,7 @@ def test_zero_or_more - } + XML validator = REXML::Validation::RelaxNG.new( rng ) no_error( validator, %q{} ) @@ -143,7 +143,7 @@ def test_zero_or_more end def test_one_or_more - rng = %q{ + rng = <<-XML @@ -154,7 +154,7 @@ def test_one_or_more - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -165,13 +165,13 @@ def test_one_or_more end def test_attribute - rng = %q{ + rng = <<-XML - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -181,7 +181,7 @@ def test_attribute end def test_choice_attributes - rng = %q{ + rng = <<-XML @@ -189,7 +189,7 @@ def test_choice_attributes - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -199,7 +199,7 @@ def test_choice_attributes end def test_choice_attribute_element - rng = %q{ + rng = <<-XML @@ -207,7 +207,7 @@ def test_choice_attribute_element - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -217,12 +217,12 @@ def test_choice_attribute_element end def test_empty - rng = %q{ + rng = <<-XML - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -231,12 +231,12 @@ def test_empty end def test_text_val - rng = %q{ + rng = <<-XML - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -245,7 +245,7 @@ def test_text_val end def test_choice_text - rng = %q{ + rng = <<-XML @@ -253,7 +253,7 @@ def test_choice_text - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{Text} ) @@ -263,7 +263,7 @@ def test_choice_text end def test_group - rng = %q{ + rng = <<-XML @@ -274,7 +274,7 @@ def test_group - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -282,7 +282,7 @@ def test_group no_error( validator, %q{} ) no_error( validator, %q{} ) - rng = %q{ + rng = <<-XML @@ -291,7 +291,7 @@ def test_group - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -302,14 +302,14 @@ def test_group def test_value # Values as text nodes - rng = %q{ + rng = <<-XML VaLuE - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{X} ) @@ -317,7 +317,7 @@ def test_value no_error( validator, %q{VaLuE} ) # Values as text nodes, via choice - rng = %q{ + rng = <<-XML @@ -327,7 +327,7 @@ def test_value - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -336,14 +336,14 @@ def test_value no_error( validator, %q{Option 2} ) # Attribute values - rng = %q{ + rng = <<-XML VaLuE - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -352,7 +352,7 @@ def test_value no_error( validator, %q{} ) # Attribute values via choice - rng = %q{ + rng = <<-XML @@ -362,7 +362,7 @@ def test_value - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -372,7 +372,7 @@ def test_value end def test_interleave - rng = %q{ + rng = <<-XML @@ -383,7 +383,7 @@ def test_interleave - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -396,7 +396,7 @@ def test_interleave end def test_mixed - rng = %q{ + rng = <<-XML @@ -405,7 +405,7 @@ def test_mixed - } + XML validator = REXML::Validation::RelaxNG.new( rng ) no_error( validator, %q{Text} ) @@ -413,7 +413,7 @@ def test_mixed end def test_ref_sequence - rng = %q{ + rng = <<-XML @@ -429,7 +429,7 @@ def test_ref_sequence - } + XML validator = REXML::Validation::RelaxNG.new( rng ) no_error( validator, %q{} ) @@ -437,7 +437,7 @@ def test_ref_sequence end def test_ref_choice - rng = %q{ + rng = <<-XML @@ -453,7 +453,7 @@ def test_ref_choice - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -461,7 +461,7 @@ def test_ref_choice no_error( validator, %q{} ) no_error( validator, %q{} ) - rng = %q{ + rng = <<-XML @@ -477,7 +477,7 @@ def test_ref_choice - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -485,7 +485,7 @@ def test_ref_choice no_error( validator, %q{} ) no_error( validator, %q{} ) - rng = %q{ + rng = <<-XML @@ -502,7 +502,7 @@ def test_ref_choice - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -513,7 +513,7 @@ def test_ref_choice def test_ref_zero_plus - rng = %q{ + rng = <<-XML @@ -530,7 +530,7 @@ def test_ref_zero_plus - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -538,7 +538,7 @@ def test_ref_zero_plus no_error( validator, %q{} ) no_error( validator, %q{} ) - rng = %q{ + rng = <<-XML @@ -555,7 +555,7 @@ def test_ref_zero_plus - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -566,7 +566,7 @@ def test_ref_zero_plus def test_ref_one_plus - rng = %q{ + rng = <<-XML @@ -583,7 +583,7 @@ def test_ref_one_plus - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -591,7 +591,7 @@ def test_ref_one_plus no_error( validator, %q{} ) no_error( validator, %q{} ) - rng = %q{ + rng = <<-XML @@ -608,7 +608,7 @@ def test_ref_one_plus - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -618,7 +618,7 @@ def test_ref_one_plus end def test_ref_interleave - rng = %q{ + rng = <<-XML @@ -634,7 +634,7 @@ def test_ref_interleave - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -643,7 +643,7 @@ def test_ref_interleave no_error( validator, %q{} ) no_error( validator, %q{} ) - rng = %q{ + rng = <<-XML @@ -659,7 +659,7 @@ def test_ref_interleave - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -668,7 +668,7 @@ def test_ref_interleave no_error( validator, %q{} ) no_error( validator, %q{} ) - rng = %q{ + rng = <<-XML @@ -687,7 +687,7 @@ def test_ref_interleave - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -698,7 +698,7 @@ def test_ref_interleave end def test_ref_recurse - rng = %q{ + rng = <<-XML @@ -715,7 +715,7 @@ def test_ref_recurse - } + XML validator = REXML::Validation::RelaxNG.new( rng ) error( validator, %q{} ) @@ -724,7 +724,7 @@ def test_ref_recurse end def test_ref_optional - rng = %q{ + rng = <<-XML @@ -740,7 +740,7 @@ def test_ref_optional - } + XML validator = REXML::Validation::RelaxNG.new( rng ) no_error( validator, %q{} ) @@ -748,7 +748,7 @@ def test_ref_optional error( validator, %q{} ) error( validator, %q{} ) - rng = %q{ + rng = <<-XML @@ -764,7 +764,7 @@ def test_ref_optional - } + XML validator = REXML::Validation::RelaxNG.new( rng ) no_error( validator, %q{} ) diff --git a/test/test_xml_declaration.rb b/test/test_xml_declaration.rb index 6db54bab..6a1f4df0 100644 --- a/test/test_xml_declaration.rb +++ b/test/test_xml_declaration.rb @@ -6,7 +6,7 @@ module REXMLTests class TestXmlDeclaration < Test::Unit::TestCase def setup - xml = <<-XML + xml = <<~XML diff --git a/test/xpath/test_base.rb b/test/xpath/test_base.rb index 5156bbbe..1dacd69d 100644 --- a/test/xpath/test_base.rb +++ b/test/xpath/test_base.rb @@ -451,6 +451,46 @@ def test_following # puts results #end + def test_nested_predicates + doc = Document.new <<-EOF +

+
+ ab + cd +
+
+ ef + gh +
+
+ hi +
+
+ EOF + + matches = XPath.match(doc, '(/div/div/test[0])').map(&:text) + assert_equal [], matches + matches = XPath.match(doc, '(/div/div/test[1])').map(&:text) + assert_equal ["ab", "ef", "hi"], matches + matches = XPath.match(doc, '(/div/div/test[2])').map(&:text) + assert_equal ["cd", "gh"], matches + matches = XPath.match(doc, '(/div/div/test[3])').map(&:text) + assert_equal [], matches + + matches = XPath.match(doc, '(/div/div/test[1])[1]').map(&:text) + assert_equal ["ab"], matches + matches = XPath.match(doc, '(/div/div/test[1])[2]').map(&:text) + assert_equal ["ef"], matches + matches = XPath.match(doc, '(/div/div/test[1])[3]').map(&:text) + assert_equal ["hi"], matches + matches = XPath.match(doc, '(/div/div/test[2])[1]').map(&:text) + assert_equal ["cd"], matches + matches = XPath.match(doc, '(/div/div/test[2])[2]').map(&:text) + assert_equal ["gh"], matches + matches = XPath.match(doc, '(/div/div/test[2])[3]').map(&:text) + assert_equal [], matches + end + # Contributed by Mike Stok def test_starts_with source = <<-EOF @@ -611,7 +651,7 @@ def test_comparisons source = "" doc = REXML::Document.new(source) - # NOTE TO SER: check that number() is required + # NOTE: check that number() is required assert_equal 2, REXML::XPath.match(doc, "//b[number(@id) > 1]").size assert_equal 3, REXML::XPath.match(doc, "//b[number(@id) >= 1]").size assert_equal 1, REXML::XPath.match(doc, "//b[number(@id) <= 1]").size diff --git a/test/xpath/test_predicate.rb b/test/xpath/test_predicate.rb index c8520712..278e3765 100644 --- a/test/xpath/test_predicate.rb +++ b/test/xpath/test_predicate.rb @@ -6,7 +6,7 @@ module REXMLTests class TestXPathPredicate < Test::Unit::TestCase include REXML - SRC=<<-EOL + SRC=<<~EOL
free flowing text. pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy