Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmd/engine/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ func run() error {
DB: pool,
Storage: store,
LLM: llmClient,
Parsers: ingest.RegistryFromTableOpts(tableOptsFromConfig(cfg.Ingest.Tables)),
Parsers: ingest.RegistryFromIngestParams(tableOptsFromConfig(cfg.Ingest.Tables), cfg.Ingest.MaxSections, time.Duration(cfg.Ingest.ParseTimeoutSeconds)*time.Second),
Logger: logger,
Mode: cfg.Ingest.Mode,
HyDEEnabled: cfg.Ingest.HyDE.Enabled,
Expand Down
2 changes: 1 addition & 1 deletion cmd/server/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ func run() error {
DB: pool,
Storage: store,
LLM: llmClient,
Parsers: ingest.RegistryFromTableOpts(tableOptsFromConfig(cfg.Engine.Ingest.Tables)),
Parsers: ingest.RegistryFromIngestParams(tableOptsFromConfig(cfg.Engine.Ingest.Tables), cfg.Engine.Ingest.MaxSections, time.Duration(cfg.Engine.Ingest.ParseTimeoutSeconds)*time.Second),
Logger: logger,
Mode: cfg.Engine.Ingest.Mode,
HyDEEnabled: cfg.Engine.Ingest.HyDE.Enabled,
Expand Down
33 changes: 33 additions & 0 deletions config.example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,39 @@ ingest:
# vectorless-server use VLS_INGEST_MODE=minimal (no secret edit needed).
mode: "full"

# Total-parse timeout (seconds). Bounds the ENTIRE parse of one
# document end to end — row extraction, table extraction, section
# building, and the leaf-section cap. It is the outermost robustness
# valve: a pathological/malformed PDF (observed: a 10-K stuck 600s+ in
# `parsing`, even in minimal mode, inside pure-Go row extraction) is
# abandoned at the deadline and the document fails fast instead of
# wedging the pipeline forever. NOTHING is disabled by this bound — the
# full feature set (LLM TOC, tables, summarize, HyDE, multi-axis) still
# runs; parse is merely time-boxed. Applies in BOTH full and minimal
# mode (parse runs in both).
#
# 120 is comfortably longer than a healthy 300-page filing's parse
# (seconds to low tens of seconds) yet short enough to reap a hang
# quickly. 0 uses the engine default (120). Override per-process with
# VLE_INGEST_PARSE_TIMEOUT_SECONDS; the deployed server also honours
# VLS_INGEST_PARSE_TIMEOUT_SECONDS.
parse_timeout_seconds: 120

# Cap on the number of leaf sections one document may produce. A
# pathological PDF (e.g. a 90-page 10-K whose every bold statement
# title trips the heading detector, or a heading→one-body-leaf chain
# repeated hundreds of times) can shatter into far more leaves than the
# document has real sections — each leaf then costs a summarize + HyDE
# + multi-axis LLM call, which is what throttles/stalls full ingest.
# When the parsed leaf count exceeds this cap the parser merges
# sections (smallest first; single-leaf parents collapse into their
# parent) until the document is back under the cap, preserving content
# and never merging table sections. 400 sits comfortably above a real
# filing's section count while still catching the runaway case. 0 uses
# the engine default (400); a negative value is rejected by config
# validation. Override with VLE_INGEST_MAX_SECTIONS.
max_sections: 400

# The summarize and HyDE stages run concurrently. This caps the total
# number of LLM calls in flight across both stages combined, so the
# provider's per-tenant concurrency limit isn't exceeded. 0 disables
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ require (
github.com/go-chi/chi/v5 v5.2.5
github.com/google/uuid v1.6.0
github.com/hallelx2/llmgate v0.2.0
github.com/hallelx2/pdftable v0.3.0
github.com/hallelx2/pdftable v0.3.1
github.com/hibiken/asynq v0.26.0
github.com/jackc/pgx/v5 v5.9.2
github.com/ledongthuc/pdf v0.0.0-20250511090121-5959a4027728
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,8 @@ github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.1 h1:X5VWvz21y3gzm9Nw/kaUeku/1+u
github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.1/go.mod h1:Zanoh4+gvIgluNqcfMVTJueD4wSS5hT7zTt4Mrutd90=
github.com/hallelx2/llmgate v0.2.0 h1:x/LNCeHUPZpafn2IXi+LqpnZa7TtEQdLVlpkkJTlzBI=
github.com/hallelx2/llmgate v0.2.0/go.mod h1:MK2Ol/5CIweTQ2/9eSiTJ5g/KSSuobNZL9TD3s57JxY=
github.com/hallelx2/pdftable v0.3.0 h1:SwZPu2z4cIR4R30gP+7bpunGh931StjO1vrsxoldiDw=
github.com/hallelx2/pdftable v0.3.0/go.mod h1:pxNlc4D43wjzis7M6EfgQZvHOsQ4okggm+xqUu+OokI=
github.com/hallelx2/pdftable v0.3.1 h1:Uqe+9G8s9jrGYwxk8dEMXBCB+SlzvWPmW0Ze5863W1I=
github.com/hallelx2/pdftable v0.3.1/go.mod h1:pxNlc4D43wjzis7M6EfgQZvHOsQ4okggm+xqUu+OokI=
github.com/hhrutter/lzw v1.0.0 h1:laL89Llp86W3rRs83LvKbwYRx6INE8gDn0XNb1oXtm0=
github.com/hhrutter/lzw v1.0.0/go.mod h1:2HC6DJSn/n6iAZfgM3Pg+cP1KxeWc3ezG8bBqW5+WEo=
github.com/hhrutter/pkcs7 v0.2.2 h1:xMoifoVWah1LNym3C0pomEiLmyJyVIBXt/8oTPyPz+8=
Expand Down
11 changes: 10 additions & 1 deletion internal/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ func Default() Config {
MaxAge: 86400,
},
Governance: GovernanceConfig{
MaxBodySizeBytes: 33554432, // 32 MiB
MaxBodySizeBytes: 33554432, // 32 MiB
DefaultTimeout: 30 * time.Second,
QueryTimeout: 120 * time.Second,
},
Expand Down Expand Up @@ -326,6 +326,15 @@ func applyEnvOverrides(c *Config) {
if v := firstEnv("VLS_INGEST_MODE", "VLE_INGEST_MODE"); v != "" {
c.Engine.Ingest.Mode = v
}
// Total-parse timeout (seconds). Forwarded so the deployed server
// honours a tuned parse deadline without a secret/config edit — the
// outermost robustness valve against a parse that hangs (pre-LLM,
// pure-Go row extraction). VLS_-prefixed wins over VLE_.
if v := firstEnv("VLS_INGEST_PARSE_TIMEOUT_SECONDS", "VLE_INGEST_PARSE_TIMEOUT_SECONDS"); v != "" {
if n, err := strconv.Atoi(v); err == nil && n >= 0 {
c.Engine.Ingest.ParseTimeoutSeconds = n
}
}
// Anthropic-compatible gateway overrides (e.g. GLM/Zhipu via
// https://api.z.ai/api/anthropic): base URL + model, so the
// anthropic driver can run a non-Anthropic model without a secret
Expand Down
31 changes: 31 additions & 0 deletions pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,28 @@ type IngestConfig struct {
// section count (~170-510 with tables) while still catching the
// runaway case. A negative value is rejected by Validate.
MaxSections int `yaml:"max_sections"`

// ParseTimeoutSeconds bounds the ENTIRE parse of a single document —
// row extraction, table extraction, section building, and the leaf
// cap, end to end. It is the outermost robustness valve: every
// per-stage timeout inside the parser (per-page / doc-wide table
// budgets) is bounded by something pre-LLM, but pure-Go row extraction
// (ledongthuc's reader.Page(n).Content()) had no bound, so a
// pathological PDF (observed: a 10-K stuck 600s+ in `parsing` even in
// minimal mode) could hang the parse forever.
//
// When the whole parse exceeds this deadline the parser abandons the
// work and returns a clear error; the ingest pipeline treats it like
// any other parse failure (the document goes to `failed`), so a doc
// that can't parse in time fails fast and is visible to ops/bench
// rather than wedging the pipeline. NOTHING is disabled — the full
// feature set (LLM TOC, tables, summarize, HyDE, multi-axis) still
// runs; parse is merely bounded.
//
// 0 (or omitted) defaults to 120. A negative value is rejected by
// Validate. Engine env override: VLE_INGEST_PARSE_TIMEOUT_SECONDS;
// the server binary also forwards VLS_/VLE_INGEST_PARSE_TIMEOUT_SECONDS.
ParseTimeoutSeconds int `yaml:"parse_timeout_seconds"`
}

// TOCBlock configures the LLM-driven table-of-contents tree
Expand Down Expand Up @@ -728,6 +750,7 @@ func Default() Config {
GlobalLLMConcurrency: 12,
LLMCallTimeoutSeconds: 90,
MaxSections: 400,
ParseTimeoutSeconds: 120,
HyDE: HyDEConfig{
Enabled: true,
NumQuestions: 5,
Expand Down Expand Up @@ -911,6 +934,11 @@ func applyEnvOverrides(c *Config) {
c.Ingest.MaxSections = n
}
}
if v := os.Getenv("VLE_INGEST_PARSE_TIMEOUT_SECONDS"); v != "" {
if n, err := strconv.Atoi(v); err == nil && n >= 0 {
c.Ingest.ParseTimeoutSeconds = n
}
}
// pdftable-driven table extraction.
if v := os.Getenv("VLE_INGEST_TABLES_ENABLED"); v != "" {
switch strings.ToLower(strings.TrimSpace(v)) {
Expand Down Expand Up @@ -1200,6 +1228,9 @@ func (c Config) Validate() error {
if c.Ingest.MaxSections < 0 {
return fmt.Errorf("ingest.max_sections must be >= 0, got %d", c.Ingest.MaxSections)
}
if c.Ingest.ParseTimeoutSeconds < 0 {
return fmt.Errorf("ingest.parse_timeout_seconds must be >= 0, got %d", c.Ingest.ParseTimeoutSeconds)
}

switch c.Ingest.Tables.VerticalStrategy {
case "", "lines", "lines_strict", "text", "explicit":
Expand Down
32 changes: 32 additions & 0 deletions pkg/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,38 @@ func TestDefaultValues(t *testing.T) {
if cfg.Ingest.TOC.TOCCheckPages != 20 {
t.Errorf("ingest.toc.toc_check_pages = %d, want 20", cfg.Ingest.TOC.TOCCheckPages)
}
if cfg.Ingest.ParseTimeoutSeconds != 120 {
t.Errorf("ingest.parse_timeout_seconds = %d, want 120", cfg.Ingest.ParseTimeoutSeconds)
}
if cfg.Ingest.MaxSections != 400 {
t.Errorf("ingest.max_sections = %d, want 400", cfg.Ingest.MaxSections)
}
}

// TestIngestParseTimeoutEnvOverride covers VLE_INGEST_PARSE_TIMEOUT_SECONDS
// — the operator knob that lets a tuned whole-parse deadline reach the
// parser without a config-file edit.
func TestIngestParseTimeoutEnvOverride(t *testing.T) {
prev := os.Getenv("VLE_INGEST_PARSE_TIMEOUT_SECONDS")
defer os.Setenv("VLE_INGEST_PARSE_TIMEOUT_SECONDS", prev)

os.Setenv("VLE_INGEST_PARSE_TIMEOUT_SECONDS", "300")
cfg := Default()
applyEnvOverrides(&cfg)
if cfg.Ingest.ParseTimeoutSeconds != 300 {
t.Errorf("ingest.parse_timeout_seconds = %d, want 300", cfg.Ingest.ParseTimeoutSeconds)
}
}

// TestIngestParseTimeoutValidate pins the validation rule for
// ingest.parse_timeout_seconds: starting from Default(), setting the
// field to -1 must make Validate return an error, so a negative
// deadline is surfaced as a config error rather than accepted silently.
func TestIngestParseTimeoutValidate(t *testing.T) {
	cfg := Default()
	cfg.Ingest.ParseTimeoutSeconds = -1

	err := cfg.Validate()
	if err != nil {
		// Rejected, which is the required outcome.
		return
	}
	t.Error("Validate should reject a negative ingest.parse_timeout_seconds")
}

// TestIngestModeDefault locks the default ingest mode to "full" so the
Expand Down
28 changes: 24 additions & 4 deletions pkg/ingest/ingest.go
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@
// stop retrying immediately on a timeout: re-issuing a call that just hung
// would only multiply the wall-time cost (N retries × the timeout) without
// changing the outcome, so a timeout is terminal, not retryable.
func isTimeout(err error) bool {
	// NOTE(review): the CI lint job reports this function as unused
	// (U1000, pkg/ingest/ingest.go line 302). Either call it from the
	// retry path it was written for or delete it so lint passes.
	//
	// errors.Is walks the wrap chain, so a deadline or cancellation that
	// was wrapped with %w further up is still recognised. context.Canceled
	// is classified together with context.DeadlineExceeded; a nil error
	// matches neither and yields false.
	return errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled)
}

Expand Down Expand Up @@ -1122,15 +1122,35 @@
}

// RegistryFromTableOpts builds a parser.Registry whose PDF parser takes
// its table-extraction settings from opts, while the leaf-section cap
// and the total-parse timeout stay at the parser's defaults (400
// sections, 120s). A nil opts disables table extraction entirely;
// parser.DefaultTableOpts() (or a custom set) enables it. Every non-PDF
// parser is constructed at its defaults.
//
// It delegates to RegistryFromIngestParams with zero for both tunables.
// Call RegistryFromIngestParams directly to thread an operator-tuned
// cap / parse timeout from config.
func RegistryFromTableOpts(opts *parser.TableOpts) *parser.Registry {
	// Zero means "use the parser's built-in default" for both the
	// section cap and the parse timeout.
	const parserDefault = 0

	registry := RegistryFromIngestParams(opts, parserDefault, parserDefault)
	return registry
}

// RegistryFromIngestParams returns a parser.Registry where the PDF parser
// is configured from the supplied TableOpts AND the operator-tuned
// leaf-section cap and total-parse timeout (ingest.max_sections,
// ingest.parse_timeout_seconds).
//
// Parameters:
//   - opts: table-extraction options handed to the PDF parser.
//   - maxSections: leaf-section cap; 0 selects the parser's built-in
//     default, a negative value disables the cap.
//   - parseTimeout: whole-parse deadline; 0 selects the parser's
//     built-in default, a negative value disables the deadline.
//
// All non-PDF parsers (Markdown, HTML, DOCX, plain text) are constructed
// at their defaults.
//
// This is the constructor the engine/server wiring uses so the parse
// deadline and section cap from config actually reach the parser — the
// outermost robustness valves for full-feature ingest. Table extraction,
// the section tree, and the cap all still run; they are merely bounded.
func RegistryFromIngestParams(opts *parser.TableOpts, maxSections int, parseTimeout time.Duration) *parser.Registry {
	return parser.NewRegistry(
		parser.NewMarkdown(),
		parser.NewHTML(),
		parser.NewDOCX(),
		// Register exactly one PDF parser. The legacy
		// parser.NewPDFWithTables(opts) entry must not be listed as well:
		// it receives neither maxSections nor parseTimeout, so keeping it
		// would leave a second PDF parser without the configured bounds.
		parser.NewPDFWithConfig(opts, maxSections, parseTimeout),
		parser.NewText(),
	)
}
Expand Down
Loading
Loading