From d1c2bdf6398c733e119635a628f623bdbc0e6765 Mon Sep 17 00:00:00 2001 From: olabusayoT <50379531+olabusayoT@users.noreply.github.com> Date: Thu, 7 May 2026 13:29:46 -0400 Subject: [PATCH 1/3] Add `infosetWalkerMode` tunable for streaming and non-streaming modes - Introduced `infosetWalkerMode` tunable in `dafext.xsd` with `streaming` and `nonStreaming` options. - Updated parser logic to handle mode selection, optimizing traversal for simpler schemas with `nonStreaming` and complex schemas with `streaming`. - Added TDML tests to validate the new tunable functionality. Deprecation/Compatibility The infoset walker mode has been changed to non-streaming behavior as the default. To change to previous behavior, set the tunable infosetWalkerMode=streaming. DAFFODIL-3070 --- .../daffodil/cli/cliTest/TestCLIParsing.scala | 35 ++++++------ .../runtime1/infoset/InfosetImpl.scala | 57 ++++++++++++++++++- .../runtime1/processors/DataProcessor.scala | 15 +++-- .../parsers/SequenceParserBases.scala | 13 ----- .../core/infoset/TestInfosetFree.scala | 1 + .../org/apache/daffodil/xsd/dafext.xsd | 18 ++++++ .../section00/general/infosetWalker.tdml | 27 ++++++++- .../daffodil/section13/nillable/nillable.tdml | 1 + .../section00/general/TestInfosetWalker.scala | 2 + 9 files changed, 132 insertions(+), 37 deletions(-) diff --git a/daffodil-cli/src/test/scala/org/apache/daffodil/cli/cliTest/TestCLIParsing.scala b/daffodil-cli/src/test/scala/org/apache/daffodil/cli/cliTest/TestCLIParsing.scala index eeb2148811..17d8b99de8 100644 --- a/daffodil-cli/src/test/scala/org/apache/daffodil/cli/cliTest/TestCLIParsing.scala +++ b/daffodil-cli/src/test/scala/org/apache/daffodil/cli/cliTest/TestCLIParsing.scala @@ -1018,23 +1018,24 @@ class TestCLIParsing { "daffodil-cli/src/test/resources/org/apache/daffodil/cli/cli_schema_05.dfdl.xsd" ) - runCLI(args"parse -s $schema -I jdom -TinfosetWalkerSkipMin=0 -TinfosetWalkerSkipMax=0") { - cli => - // this is not enough data for the scema, which leads to a 
parse error about insufficient bits - cli.sendBytes(Array[Byte](0, 0, 0, 1), inputDone = true) - - // there was a bug Daffodil that is most easily observed using the jdom infoset outputter - // with a non skipping infoset walker. With this setup, when an element fails to parse - // inside a choice dispatch (and no surrounding points of uncertainty) the infoset walker - // could walk into the failed element, which leads to an SDE when using the JDOM infoset - // outputter. This SDE prevents backtracking so we do not see a diagnostic about the - // choice dispatch branch failing. If the bug is fixed, we should never walk into the - // invalid element, we should not get an SDE, and we should get a diagnostic about choice - // dispatch. - cli.expectErr("Parse Error: Choice dispatch branch failed") - - // this is the core failure diagnostic, which we see regardless of bug - cli.expectErr("Parse Error: Insufficient bits in data.") + runCLI( + args"parse -s $schema -I jdom -TinfosetWalkerMode=streaming -TinfosetWalkerSkipMin=0 -TinfosetWalkerSkipMax=0" + ) { cli => + // this is not enough data for the schema, which leads to a parse error about insufficient bits + cli.sendBytes(Array[Byte](0, 0, 0, 1), inputDone = true) + + // there was a bug in Daffodil that is most easily observed using the jdom infoset outputter + // with a non skipping infoset walker. With this setup, when an element fails to parse + // inside a choice dispatch (and no surrounding points of uncertainty) the infoset walker + // could walk into the failed element, which leads to an SDE when using the JDOM infoset + // outputter. This SDE prevents backtracking so we do not see a diagnostic about the + // choice dispatch branch failing. If the bug is fixed, we should never walk into the + // invalid element, we should not get an SDE, and we should get a diagnostic about choice + // dispatch. 
+ cli.expectErr("Parse Error: Choice dispatch branch failed") + + // this is the core failure diagnostic, which we see regardless of bug + cli.expectErr("Parse Error: Insufficient bits in data.") }(ExitCode.ParseError) } } diff --git a/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/infoset/InfosetImpl.scala b/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/infoset/InfosetImpl.scala index 8b1741f110..87e96c4e4e 100644 --- a/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/infoset/InfosetImpl.scala +++ b/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/infoset/InfosetImpl.scala @@ -32,10 +32,12 @@ import java.util.concurrent.atomic.AtomicInteger import java.util.concurrent.atomic.AtomicLong import scala.collection.mutable.ArrayBuffer +import org.apache.daffodil.api import org.apache.daffodil.api.infoset.InfosetArray import org.apache.daffodil.api.infoset.InfosetComplexElement import org.apache.daffodil.api.infoset.InfosetDocument import org.apache.daffodil.api.infoset.InfosetElement +import org.apache.daffodil.api.infoset.InfosetOutputter import org.apache.daffodil.api.infoset.InfosetSimpleElement import org.apache.daffodil.api.infoset.InfosetTypeException import org.apache.daffodil.api.metadata.ComplexElementMetadata @@ -48,6 +50,7 @@ import org.apache.daffodil.lib.equality.TypeEqual import org.apache.daffodil.lib.equality.ViewEqual import org.apache.daffodil.lib.exceptions.Assert import org.apache.daffodil.lib.exceptions.ThinException +import org.apache.daffodil.lib.exceptions.ThrowsSDE import org.apache.daffodil.lib.iapi.DaffodilTunables import org.apache.daffodil.lib.iapi.Diagnostic import org.apache.daffodil.lib.iapi.ThinDiagnostic @@ -193,6 +196,24 @@ sealed trait DINode { * Array or Complex exception. 
*/ def requireFinal(): Unit + + def walk(outputter: api.infoset.InfosetOutputter): Unit + + protected def doOutputter(outputterFunc: => Unit, desc: String, context: ThrowsSDE): Unit = { + try { + outputterFunc + } catch { + case e: Exception => { + // FIXME: DAFFODIL-2884 This escalates a parser data exception to an SDE + // Which breaks if string-as-xml encounters a string that is malformed XML. + // We get the error thrown by the xml parser here outside of parsing, which is + // too late. + val cause = e.getCause + val msg = if (cause == null) e.toString else cause.toString + context.SDE("Failed to %s: %s", desc, msg) + } + } + } } /** @@ -1313,6 +1334,16 @@ final class DIArray( } } } + + override def walk(outputter: InfosetOutputter): Unit = { + if (!isHidden) { + doOutputter(outputter.startArray(this), "start infoset array", erd) + _contents.foreach { child => + child.walk(outputter) + } + doOutputter(outputter.endArray(this), "end infoset array", erd) + } + } } /** @@ -1666,6 +1697,13 @@ sealed class DISimple(override val erd: ElementRuntimeData) } override def getObject: Object = getAnyRef + + override def walk(outputter: api.infoset.InfosetOutputter): Unit = { + if (!isHidden) { + doOutputter(outputter.startSimple(this), "start infoset simple element", erd) + doOutputter(outputter.endSimple(this), "end infoset simple element", erd) + } + } } /** @@ -1710,7 +1748,7 @@ sealed class DIComplex(override val erd: ElementRuntimeData) if (!isFinal) throw nfe } - private val childNodes = new ArrayBuffer[DINode] + protected val childNodes = new ArrayBuffer[DINode] private lazy val nameToChildNodeLookup = new java.util.HashMap[NamedQName, ArrayBuffer[DINode]] @@ -2008,6 +2046,15 @@ sealed class DIComplex(override val erd: ElementRuntimeData) } } + override def walk(outputter: InfosetOutputter): Unit = { + if (!isHidden) { + doOutputter(outputter.startComplex(this), "start infoset complex element", erd) + childNodes.foreach { child => + child.walk(outputter) + } + 
doOutputter(outputter.endComplex(this), "end infoset complex element", erd) + } + } } /* @@ -2022,6 +2069,14 @@ final class DIDocument(erd: ElementRuntimeData) extends DIComplex(erd) with Info * a constant value */ var isCompileExprFalseRoot: Boolean = false + + override def walk(outputter: InfosetOutputter): Unit = { + doOutputter(outputter.startDocument(), "start infoset document", erd) + childNodes.foreach { child => + child.walk(outputter) + } + doOutputter(outputter.endDocument(), "end infoset document", erd) + } } object Infoset { diff --git a/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/DataProcessor.scala b/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/DataProcessor.scala index 3b16d3680e..c20a4a0733 100644 --- a/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/DataProcessor.scala +++ b/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/DataProcessor.scala @@ -39,6 +39,7 @@ import org.apache.daffodil.api.validation.ValidatorInitializationException import org.apache.daffodil.api.validation.Validators import org.apache.daffodil.lib.equality.* import org.apache.daffodil.lib.iapi.DaffodilTunables +import org.apache.daffodil.lib.iapi.InfosetWalkerMode import org.apache.daffodil.lib.iapi.WithDiagnostics import org.apache.daffodil.runtime1.dsom.* import org.apache.daffodil.runtime1.iapi.DFDL @@ -386,11 +387,15 @@ class DataProcessor( state.setMaybeProcessor(Maybe(p)) if (state.processorStatus == Success) { - // At this point all infoset nodes have been set final, all PoUs - // resolved, and all infoset walker blocks released. 
Do one last walk - // to project any unwalked elements to the target infoset - state.walker.walk(lastWalk = true) - Assert.invariant(state.walker.isFinished) + if (tunables.infosetWalkerMode == InfosetWalkerMode.NonStreaming) { + state.infoset.walk(state.output) + } else { + // At this point all infoset nodes have been set final, all PoUs + // resolved, and all infoset walker blocks released. Do one last walk + // to project any unwalked elements to the target infoset + state.walker.walk(lastWalk = true) + Assert.invariant(state.walker.isFinished) + } } } catch { // We will actually be handling all errors in the outer loop diff --git a/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/SequenceParserBases.scala b/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/SequenceParserBases.scala index a0366cf812..ba9a8d8c4e 100644 --- a/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/SequenceParserBases.scala +++ b/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/SequenceParserBases.scala @@ -212,11 +212,6 @@ abstract class SequenceParserBase( // should not increment the group index. pstate.mpstate.moveOverOneGroupIndexOnly() } - - // we might have added a new instance to the array. Attempt to project it to an - // infoset if there are no PoU's or anything blocking it - pstate.walker.walk() - } // end while for each repeat parser.endArray(pstate) parser.arrayCompleteChecks(pstate, resultOfTry, priorResultOfTry) @@ -312,10 +307,6 @@ abstract class SequenceParserBase( } // end case scalarParser } // end match case parser - // we finished parsing one whole thing (scalar element, entire array, etc). Attempt to - // project it to an infoset if there are no PoU's or anything blocking it - pstate.walker.walk() - scpIndex += 1 } // end while for each sequence child parser @@ -330,10 +321,6 @@ abstract class SequenceParserBase( // that we incremented above. 
This will allow the infoset walker to walk // into the new children that are now in the correct order. pstate.infoset.infosetWalkerBlockCount -= 1 - - // we've unblocked the unordered sequence, try walking to output - // everything we've created - pstate.walker.walk() } if (child ne null) child.sequenceCompleteChecks(pstate, resultOfTry, priorResultOfTry) diff --git a/daffodil-core/src/test/scala/org/apache/daffodil/core/infoset/TestInfosetFree.scala b/daffodil-core/src/test/scala/org/apache/daffodil/core/infoset/TestInfosetFree.scala index b475caf5da..dd6e3bf50d 100644 --- a/daffodil-core/src/test/scala/org/apache/daffodil/core/infoset/TestInfosetFree.scala +++ b/daffodil-core/src/test/scala/org/apache/daffodil/core/infoset/TestInfosetFree.scala @@ -53,6 +53,7 @@ object TestInfosetFree { val compiler = Compiler() .withTunable("releaseUnneededInfoset", "false") + .withTunable("infosetWalkerMode", "streaming") val pf = compiler.compileNode(schema) if (pf.isError) { diff --git a/daffodil-propgen/src/main/resources/org/apache/daffodil/xsd/dafext.xsd b/daffodil-propgen/src/main/resources/org/apache/daffodil/xsd/dafext.xsd index c349278019..93188467c6 100644 --- a/daffodil-propgen/src/main/resources/org/apache/daffodil/xsd/dafext.xsd +++ b/daffodil-propgen/src/main/resources/org/apache/daffodil/xsd/dafext.xsd @@ -239,6 +239,17 @@ + + + + Daffodil can periodically walk the internal infoset to send events to the configured + InfosetOutputter (streaming) or it can walk the internal infoset once at the end of + parsing (nonStreaming). The idea being that simple schemas would benefit from the + nonStreaming infoset walker, while more complex schemas with lots of points of + uncertaintly would benefit from the streaming infoset walker. 
+ + + @@ -780,6 +791,13 @@ + + + + + + + diff --git a/daffodil-test/src/test/resources/org/apache/daffodil/section00/general/infosetWalker.tdml b/daffodil-test/src/test/resources/org/apache/daffodil/section00/general/infosetWalker.tdml index b99353dd81..d4b37e1d8c 100644 --- a/daffodil-test/src/test/resources/org/apache/daffodil/section00/general/infosetWalker.tdml +++ b/daffodil-test/src/test/resources/org/apache/daffodil/section00/general/infosetWalker.tdml @@ -30,6 +30,7 @@ --> + streaming 0 @@ -41,11 +42,19 @@ --> + streaming 0 false + + + + streaming + + + @@ -65,7 +74,7 @@ - + |header;body1;body2;body3;| @@ -113,5 +122,21 @@ + + + + |header;body1;body2;body3;| + + + + + header + body1 + body2 + body3 + + + + diff --git a/daffodil-test/src/test/resources/org/apache/daffodil/section13/nillable/nillable.tdml b/daffodil-test/src/test/resources/org/apache/daffodil/section13/nillable/nillable.tdml index a0603e777f..e13cb34a7d 100644 --- a/daffodil-test/src/test/resources/org/apache/daffodil/section13/nillable/nillable.tdml +++ b/daffodil-test/src/test/resources/org/apache/daffodil/section13/nillable/nillable.tdml @@ -384,6 +384,7 @@ + streaming 0 diff --git a/daffodil-test/src/test/scala/org/apache/daffodil/section00/general/TestInfosetWalker.scala b/daffodil-test/src/test/scala/org/apache/daffodil/section00/general/TestInfosetWalker.scala index 1396a3ac9d..37dd9c4d0b 100644 --- a/daffodil-test/src/test/scala/org/apache/daffodil/section00/general/TestInfosetWalker.scala +++ b/daffodil-test/src/test/scala/org/apache/daffodil/section00/general/TestInfosetWalker.scala @@ -33,4 +33,6 @@ class TestInfosetWalker extends TdmlTests { // DAFFODIL-2755 @Test def infosetWalker_02 = test @Test def infosetWalker_03 = test + // DAFFODIL-3070 + @Test def infosetWalker_04 = test } From 06f3427aaf47c448f18362b712f6c2eb06eda2c7 Mon Sep 17 00:00:00 2001 From: olabusayoT <50379531+olabusayoT@users.noreply.github.com> Date: Tue, 2 Jun 2026 16:36:07 -0400 Subject: [PATCH 2/3] fixup! 
Add `infosetWalkerMode` tunable for streaming and non-streaming modes - add comments - clarify when non-streaming vs streaming mode is ideal - add back periodic InfosetWalker.walk() calls Deprecation/Compatibility The infoset walker mode has been changed to non-streaming behavior as the default, as this can cause significant performance improvements in some instances. To change to previous behavior, set the tunable infosetWalkerMode=streaming. If the infoset is likely to be very large or if memory is constrained, streaming mode would be more beneficial, otherwise in most other cases, non-streaming mode will be faster or the same. DAFFODIL-3070 --- .../runtime1/infoset/InfosetImpl.scala | 5 +++++ .../parsers/SequenceParserBases.scala | 18 ++++++++++++++++++ .../org/apache/daffodil/xsd/dafext.xsd | 17 ++++++++++------- 3 files changed, 33 insertions(+), 7 deletions(-) diff --git a/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/infoset/InfosetImpl.scala b/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/infoset/InfosetImpl.scala index 87e96c4e4e..4d57e90673 100644 --- a/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/infoset/InfosetImpl.scala +++ b/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/infoset/InfosetImpl.scala @@ -197,6 +197,11 @@ sealed trait DINode { */ def requireFinal(): Unit + /** + * This is an alternative to using the InfosetWalker.walk(). The two are incompatible, + * therefore, a single parse must either call InfosetWalker.walk() or DINode.walk(), + * but never both. 
+ */ def walk(outputter: api.infoset.InfosetOutputter): Unit protected def doOutputter(outputterFunc: => Unit, desc: String, context: ThrowsSDE): Unit = { diff --git a/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/SequenceParserBases.scala b/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/SequenceParserBases.scala index ba9a8d8c4e..36c56cbfc7 100644 --- a/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/SequenceParserBases.scala +++ b/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/SequenceParserBases.scala @@ -17,6 +17,7 @@ package org.apache.daffodil.runtime1.processors.parsers import org.apache.daffodil.lib.exceptions.Assert +import org.apache.daffodil.lib.iapi.InfosetWalkerMode import org.apache.daffodil.lib.util.Maybe import org.apache.daffodil.lib.util.Maybe.Nope import org.apache.daffodil.lib.util.Maybe.One @@ -212,6 +213,11 @@ abstract class SequenceParserBase( // should not increment the group index. pstate.mpstate.moveOverOneGroupIndexOnly() } + if (pstate.tunable.infosetWalkerMode == InfosetWalkerMode.Streaming) { + // we might have added a new instance to the array. Attempt to project it to an + // infoset if there are no PoU's or anything blocking it + pstate.walker.walk() + } } // end while for each repeat parser.endArray(pstate) parser.arrayCompleteChecks(pstate, resultOfTry, priorResultOfTry) @@ -307,6 +313,12 @@ abstract class SequenceParserBase( } // end case scalarParser } // end match case parser + if (pstate.tunable.infosetWalkerMode == InfosetWalkerMode.Streaming) { + // we finished parsing one whole thing (scalar element, entire array, etc). Attempt to + // project it to an infoset if there are no PoU's or anything blocking it + pstate.walker.walk() + } + scpIndex += 1 } // end while for each sequence child parser @@ -321,6 +333,12 @@ abstract class SequenceParserBase( // that we incremented above. 
This will allow the infoset walker to walk // into the new children that are now in the correct order. pstate.infoset.infosetWalkerBlockCount -= 1 + + if (pstate.tunable.infosetWalkerMode == InfosetWalkerMode.Streaming) { + // we've unblocked the unordered sequence, try walking to output + // everything we've created + pstate.walker.walk() + } } if (child ne null) child.sequenceCompleteChecks(pstate, resultOfTry, priorResultOfTry) diff --git a/daffodil-propgen/src/main/resources/org/apache/daffodil/xsd/dafext.xsd b/daffodil-propgen/src/main/resources/org/apache/daffodil/xsd/dafext.xsd index 93188467c6..a66c2be638 100644 --- a/daffodil-propgen/src/main/resources/org/apache/daffodil/xsd/dafext.xsd +++ b/daffodil-propgen/src/main/resources/org/apache/daffodil/xsd/dafext.xsd @@ -244,17 +244,19 @@ Daffodil can periodically walk the internal infoset to send events to the configured InfosetOutputter (streaming) or it can walk the internal infoset once at the end of - parsing (nonStreaming). The idea being that simple schemas would benefit from the - nonStreaming infoset walker, while more complex schemas with lots of points of - uncertaintly would benefit from the streaming infoset walker. + parsing (nonStreaming). The idea being that simple schemas or schemas with lots of + points of uncertainty would benefit from the nonStreaming infoset walker, while + very large schemas or situations where memory is constrained would benefit + from the streaming infoset walker. - Daffodil periodically walks the internal infoset to send events to the configured - InfosetOutputter, skipping at least this number of walk attempts. Larger values + If infosetWalkerMode is "streaming", Daffodil periodically walks the + internal infoset to send events to the configured InfosetOutputter, + skipping at least this number of walk attempts. Larger values mean delayed InfosetOutputter events and more memory usage; Smaller values mean more CPU usage. 
Set this value to zero to never skip any walk attempts. This is specifically for advanced testing behavior and should not need to be changed by users. @@ -269,8 +271,9 @@ - Daffodil periodically walks the internal infoset to send events to the configured - InfosetOutputter. On walks where no progress is made, the number of walks to skip + If infosetWalkerMode is "streaming", Daffodil periodically walks the internal + infoset to send events to the configured InfosetOutputter. On walks where + no progress is made, the number of walks to skip is increased with the assumption that something is blocking it (like an unresolved point of uncertainty), up to this maximum value. Higher values mean less attempts are made when blocked for a long time, but with potentially more From e3712168d16a4bf6682f96d9cd1c4db06d686c6e Mon Sep 17 00:00:00 2001 From: olabusayoT <50379531+olabusayoT@users.noreply.github.com> Date: Fri, 5 Jun 2026 17:24:51 -0400 Subject: [PATCH 3/3] fixup! fixup! Add `infosetWalkerMode` tunable for streaming and non-streaming modes - Extract InfosetWalker trait; rename former InfosetWalker class to StreamingInfosetWalker; introduce NonStreamingInfosetWalker backed by DINode.walk() for the non-streaming path - Remove infosetWalkerMode conditionals in SequenceParserBases and DataProcessor now that the walker handles mode selection internally - Update DaffodilDebugger to always use StreamingInfosetWalker since debugger performance is not a huge priority Deprecation/Compatibility The infoset walker mode has been changed to non-streaming behavior as the default, as this can cause significant performance improvements in some instances. To change to previous behavior, set the tunable infosetWalkerMode=streaming. If the infoset is likely to be very large or if memory is constrained, streaming mode would be more beneficial, otherwise in most other cases, non-streaming mode will be faster or the same. 
DAFFODIL-3070 --- .../runtime1/debugger/DaffodilDebugger.scala | 2 +- .../runtime1/infoset/InfosetImpl.scala | 10 +- .../runtime1/infoset/InfosetWalker.scala | 95 +++++++++++++++++-- .../runtime1/processors/DataProcessor.scala | 15 +-- .../runtime1/processors/parsers/PState.scala | 25 +++-- .../parsers/SequenceParserBases.scala | 26 ++--- .../core/infoset/TestInfosetFree.scala | 2 +- 7 files changed, 125 insertions(+), 50 deletions(-) diff --git a/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/debugger/DaffodilDebugger.scala b/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/debugger/DaffodilDebugger.scala index 7ca5daf959..c5d53cdfc7 100644 --- a/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/debugger/DaffodilDebugger.scala +++ b/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/debugger/DaffodilDebugger.scala @@ -471,7 +471,7 @@ class DaffodilDebugger( private def infosetToString(ie: InfosetElement): String = { val bos = new java.io.ByteArrayOutputStream() val xml = new XMLTextInfosetOutputter(bos, pretty = true, minimal = true) - val iw = InfosetWalker( + val iw = StreamingInfosetWalker( ie.asInstanceOf[DIElement], xml, walkHidden = !DebuggerConfig.removeHidden, diff --git a/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/infoset/InfosetImpl.scala b/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/infoset/InfosetImpl.scala index 4d57e90673..5edb5196e9 100644 --- a/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/infoset/InfosetImpl.scala +++ b/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/infoset/InfosetImpl.scala @@ -198,9 +198,13 @@ sealed trait DINode { def requireFinal(): Unit /** - * This is an alternative to using the InfosetWalker.walk(). The two are incompatible, - * therefore, a single parse must either call InfosetWalker.walk() or DINode.walk(), - * but never both. 
+ * Eagerly walk the entire subtree rooted at this node, emitting + * start/end events to `outputter` in document order. Hidden nodes are + * skipped. The walk is complete and blocking — all events for this node + * and its descendants are emitted before the method returns. + * + * Used by [[NonStreamingInfosetWalker]] to project the whole infoset in + * one pass after parsing is finished. */ def walk(outputter: api.infoset.InfosetOutputter): Unit diff --git a/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/infoset/InfosetWalker.scala b/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/infoset/InfosetWalker.scala index c46b6bce61..2906e9dd37 100644 --- a/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/infoset/InfosetWalker.scala +++ b/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/infoset/InfosetWalker.scala @@ -23,7 +23,85 @@ import org.apache.daffodil.lib.exceptions.ThrowsSDE import org.apache.daffodil.lib.util.MStackOf import org.apache.daffodil.lib.util.MStackOfInt -object InfosetWalker { +/** + * Walks Daffodil's internal infoset representation (DINodes) and emits + * start/end events to an [[api.infoset.InfosetOutputter]], which projects the + * infoset to the caller's desired format (XML, JSON, SAX, etc.). + * + * Two concrete implementations exist, selectable via the `infosetWalkerMode` + * tunable: + * + * - [[StreamingInfosetWalker]] (`infosetWalkerMode = "streaming"`): emits events + * incrementally as elements are finalized during parsing. Keeps memory usage + * bounded for large or deeply-nested infosets, but incurs overhead from + * repeated speculative walk attempts. + * + * - [[NonStreamingInfosetWalker]] (`infosetWalkerMode = "nonStreaming"`, default): + * defers all output until the entire infoset is available, then walks it in + * one pass. Faster for schemas where the infoset fits comfortably in memory, + * because it avoids the overhead of incremental walk attempts. 
+ * + * Callers invoke [[walk]] periodically during parsing. When `lastWalk = true` + * the walker must flush any remaining events before returning. [[isFinished]] + * returns `true` once the entire infoset has been walked. + */ +trait InfosetWalker { + + /** + * The outputter to which events are written. + */ + def outputter: api.infoset.InfosetOutputter + + /** + * Returns `true` once the entire infoset has been walked and all events have + * been emitted. Calling [[walk]] after this is an error. + */ + def isFinished: Boolean + + /** + * Take zero or more steps in the infoset, emitting events to [[outputter]]. + * + * A single call is not guaranteed to walk the entire infoset in some + * implementations, as the walker may pause (e.g. because parsing has + * not yet finalized the next element). In those instances, the caller should + * invoke this periodically and check [[isFinished]]. + * + * @param lastWalk `true` if this is the final call; the walker must emit all + * remaining events before returning. + */ + def walk(lastWalk: Boolean = false): Unit +} + +/** + * An [[InfosetWalker]] that defers all output until the parse is complete, + * then walks the entire infoset in a single pass when `walk(lastWalk = true)` + * is called. Intermediate `walk()` calls are no-ops. + * + * This is the default walker (tunable `infosetWalkerMode = "nonStreaming"`). + * It is faster than [[StreamingInfosetWalker]] for most schemas because it + * avoids the overhead of repeated speculative walk attempts, at the cost of + * holding the full infoset in memory until parsing finishes. For very large + * infosets or memory-constrained environments, prefer [[StreamingInfosetWalker]]. + * + * @param root The root [[DIElement]] of the infoset to walk. + * @param outputter The [[api.infoset.InfosetOutputter]] that receives events. 
+ */ +class NonStreamingInfosetWalker(root: DIElement, val outputter: api.infoset.InfosetOutputter) + extends InfosetWalker { + + private var finished: Boolean = false + + override def isFinished: Boolean = finished + + def walk(lastWalk: Boolean = false): Unit = { + if (lastWalk) { + root.walk(outputter) + finished = true + } + } +} + +object StreamingInfosetWalker { /** * Create an infoset walker starting with a specified DINode. If the caller @@ -79,7 +157,7 @@ object InfosetWalker { releaseUnneededInfoset: Boolean, walkSkipMin: Int = 32, walkSkipMax: Int = 2048 - ): InfosetWalker = { + ): StreamingInfosetWalker = { // Determine the container of the root node and the index in which it // appears in that node @@ -99,7 +177,7 @@ object InfosetWalker { (container, container.indexOf(root)) } } - new InfosetWalker( + new StreamingInfosetWalker( startingContainerNode, startingContainerIndex, outputter, @@ -173,7 +251,7 @@ object InfosetWalker { * and increases the number of walk() calls to skip before trying again. This * defines the maximum number of skiped calls, even as this number increases. */ -class InfosetWalker private ( +class StreamingInfosetWalker private ( startingContainerNode: DINode, startingContainerIndex: Int, val outputter: api.infoset.InfosetOutputter, @@ -182,7 +260,7 @@ class InfosetWalker private ( releaseUnneededInfoset: Boolean, walkSkipMin: Int, walkSkipMax: Int -) { +) extends InfosetWalker { /** * These two pieces of mutable state are all that is needed to keep track of @@ -227,10 +305,7 @@ class InfosetWalker private ( private var finished = false - /** - * Determine if the walker has finished walking. - */ - def isFinished = finished + override def isFinished = finished /** * The following variables are used to determine when to skip the walk() @@ -269,7 +344,7 @@ class InfosetWalker private ( * walk() will be called, the lastWalk parameter should be set to true, which * will cause walk() to not skip any steps. 
*/ - def walk(lastWalk: Boolean = false): Unit = { + override def walk(lastWalk: Boolean = false): Unit = { Assert.usage(!finished) if (walkSkipRemaining > 0 && !lastWalk) { diff --git a/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/DataProcessor.scala b/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/DataProcessor.scala index c20a4a0733..3b16d3680e 100644 --- a/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/DataProcessor.scala +++ b/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/DataProcessor.scala @@ -39,7 +39,6 @@ import org.apache.daffodil.api.validation.ValidatorInitializationException import org.apache.daffodil.api.validation.Validators import org.apache.daffodil.lib.equality.* import org.apache.daffodil.lib.iapi.DaffodilTunables -import org.apache.daffodil.lib.iapi.InfosetWalkerMode import org.apache.daffodil.lib.iapi.WithDiagnostics import org.apache.daffodil.runtime1.dsom.* import org.apache.daffodil.runtime1.iapi.DFDL @@ -387,15 +386,11 @@ class DataProcessor( state.setMaybeProcessor(Maybe(p)) if (state.processorStatus == Success) { - if (tunables.infosetWalkerMode == InfosetWalkerMode.NonStreaming) { - state.infoset.walk(state.output) - } else { - // At this point all infoset nodes have been set final, all PoUs - // resolved, and all infoset walker blocks released. Do one last walk - // to project any unwalked elements to the target infoset - state.walker.walk(lastWalk = true) - Assert.invariant(state.walker.isFinished) - } + // At this point all infoset nodes have been set final, all PoUs + // resolved, and all infoset walker blocks released. 
Do one last walk + // to project any unwalked elements to the target infoset + state.walker.walk(lastWalk = true) + Assert.invariant(state.walker.isFinished) } } catch { // We will actually be handling all errors in the outer loop diff --git a/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/PState.scala b/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/PState.scala index fa64d628bb..521dc2d1f3 100644 --- a/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/PState.scala +++ b/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/PState.scala @@ -32,6 +32,7 @@ import org.apache.daffodil.lib.exceptions.Abort import org.apache.daffodil.lib.exceptions.Assert import org.apache.daffodil.lib.exceptions.ThrowsSDE import org.apache.daffodil.lib.iapi.DaffodilTunables +import org.apache.daffodil.lib.iapi.InfosetWalkerMode import org.apache.daffodil.lib.util.MStack import org.apache.daffodil.lib.util.MStackOf import org.apache.daffodil.lib.util.MStackOfInt @@ -52,6 +53,8 @@ import org.apache.daffodil.runtime1.infoset.DISimpleState import org.apache.daffodil.runtime1.infoset.DataValue.DataValuePrimitive import org.apache.daffodil.runtime1.infoset.Infoset import org.apache.daffodil.runtime1.infoset.InfosetWalker +import org.apache.daffodil.runtime1.infoset.NonStreamingInfosetWalker +import org.apache.daffodil.runtime1.infoset.StreamingInfosetWalker import org.apache.daffodil.runtime1.processors.DataLoc import org.apache.daffodil.runtime1.processors.DataProcessor import org.apache.daffodil.runtime1.processors.ElementRuntimeData @@ -750,15 +753,19 @@ object PState { val diagnostics = Nil val mutablePState = MPState() val tunables = dataProc.tunables - val infosetWalker = InfosetWalker( - doc.asInstanceOf[DIElement], - output, - walkHidden = false, - ignoreBlocks = false, - releaseUnneededInfoset = !areDebugging && tunables.releaseUnneededInfoset, - walkSkipMin = 
tunables.infosetWalkerSkipMin, - walkSkipMax = tunables.infosetWalkerSkipMax - ) + val infosetWalker = if (tunables.infosetWalkerMode == InfosetWalkerMode.Streaming) { + StreamingInfosetWalker( + doc.asInstanceOf[DIElement], + output, + walkHidden = false, + ignoreBlocks = false, + releaseUnneededInfoset = !areDebugging && tunables.releaseUnneededInfoset, + walkSkipMin = tunables.infosetWalkerSkipMin, + walkSkipMax = tunables.infosetWalkerSkipMax + ) + } else { + new NonStreamingInfosetWalker(doc.asInstanceOf[DIElement], output) + } dis.cst.setPriorBitOrder(root.defaultBitOrder) val newState = new PState( diff --git a/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/SequenceParserBases.scala b/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/SequenceParserBases.scala index 36c56cbfc7..4b774a403e 100644 --- a/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/SequenceParserBases.scala +++ b/daffodil-core/src/main/scala/org/apache/daffodil/runtime1/processors/parsers/SequenceParserBases.scala @@ -17,7 +17,6 @@ package org.apache.daffodil.runtime1.processors.parsers import org.apache.daffodil.lib.exceptions.Assert -import org.apache.daffodil.lib.iapi.InfosetWalkerMode import org.apache.daffodil.lib.util.Maybe import org.apache.daffodil.lib.util.Maybe.Nope import org.apache.daffodil.lib.util.Maybe.One @@ -213,11 +212,10 @@ abstract class SequenceParserBase( // should not increment the group index. pstate.mpstate.moveOverOneGroupIndexOnly() } - if (pstate.tunable.infosetWalkerMode == InfosetWalkerMode.Streaming) { - // we might have added a new instance to the array. Attempt to project it to an - // infoset if there are no PoU's or anything blocking it - pstate.walker.walk() - } + // we might have added a new instance to the array. 
Attempt to project it to an + // infoset if there are no PoU's or anything blocking it + pstate.walker.walk() + } // end while for each repeat parser.endArray(pstate) parser.arrayCompleteChecks(pstate, resultOfTry, priorResultOfTry) @@ -313,11 +311,9 @@ abstract class SequenceParserBase( } // end case scalarParser } // end match case parser - if (pstate.tunable.infosetWalkerMode == InfosetWalkerMode.Streaming) { - // we finished parsing one whole thing (scalar element, entire array, etc). Attempt to - // project it to an infoset if there are no PoU's or anything blocking it - pstate.walker.walk() - } + // we finished parsing one whole thing (scalar element, entire array, etc). Attempt to + // project it to an infoset if there are no PoU's or anything blocking it + pstate.walker.walk() scpIndex += 1 @@ -334,11 +330,9 @@ abstract class SequenceParserBase( // into the new children that are now in the correct order. pstate.infoset.infosetWalkerBlockCount -= 1 - if (pstate.tunable.infosetWalkerMode == InfosetWalkerMode.Streaming) { - // we've unblocked the unordered sequence, try walking to output - // everything we've created - pstate.walker.walk() - } + // we've unblocked the unordered sequence, try walking to output + // everything we've created + pstate.walker.walk() } if (child ne null) child.sequenceCompleteChecks(pstate, resultOfTry, priorResultOfTry) diff --git a/daffodil-core/src/test/scala/org/apache/daffodil/core/infoset/TestInfosetFree.scala b/daffodil-core/src/test/scala/org/apache/daffodil/core/infoset/TestInfosetFree.scala index dd6e3bf50d..4b1aa416dc 100644 --- a/daffodil-core/src/test/scala/org/apache/daffodil/core/infoset/TestInfosetFree.scala +++ b/daffodil-core/src/test/scala/org/apache/daffodil/core/infoset/TestInfosetFree.scala @@ -93,7 +93,7 @@ object TestInfosetFree { val detailedOutputter = new ScalaXMLInfosetOutputter(showFreedInfo = true) - val infosetWalker = InfosetWalker( + val infosetWalker = StreamingInfosetWalker( doc, detailedOutputter, 
walkHidden = true, // let's ensure any hidden elements are free