From a3207d2014c77cfacbee456a82ae00289b4ea0e7 Mon Sep 17 00:00:00 2001 From: Flook Peter Date: Sat, 14 Feb 2026 18:16:11 +1100 Subject: [PATCH 1/3] docs: Update version to 0.19.1, enhance foreign key features, and improve documentation - Bump version in various files to 0.19.1. - Introduce cardinality and nullability controls for foreign key relationships. - Add new documentation for foreign key enhancements and update existing links. - Improve logging levels for better debugging. - Update Dockerfile and example configurations to reflect the new version. - Add feature catalog for standardized documentation of all features. --- .gitignore | 1 + README.md | 2 +- app/build.gradle.kts | 1 + .../plan/ForeignKeyUniquenessProcessor.scala | 2 +- .../datacaterer/core/plan/PlanProcessor.scala | 20 +- .../sample/plan/customer-create-plan.yaml | 3 + .../task/file/csv-transaction-task.yaml | 2 +- docs/docs/generator/foreign-key.md | 535 +- docs/docs/generator/transformation.md | 2 +- docs/get-started/quick-start.md | 4 +- docs/index.md | 2 +- docs/migrations/README.md | 2 +- .../yaml-unified-format/MIGRATION.md | 4 +- docs/use-case/changelog/0.15.0.md | 2 +- docs/use-case/changelog/0.16.10.md | 4 +- docs/use-case/changelog/0.16.2.md | 2 +- docs/use-case/changelog/0.16.3.md | 2 +- docs/use-case/changelog/0.16.4.md | 2 +- docs/use-case/changelog/0.16.5.md | 2 +- docs/use-case/changelog/0.16.8.md | 2 +- docs/use-case/changelog/0.19.1.md | 37 + docs/use-case/roadmap.md | 12 +- example/Dockerfile | 2 +- gradle.properties | 2 +- misc/feature-catalog/README.md | 105 + .../docs/categories/advanced.md | 306 + .../docs/categories/configuration.md | 605 ++ .../docs/categories/connectors.md | 486 ++ .../docs/categories/generation.md | 1191 ++++ .../docs/categories/metadata.md | 121 + .../feature-catalog/docs/categories/ui-api.md | 95 + .../docs/categories/validation.md | 900 +++ .../docs/comparison-with-lite.md | 344 + misc/feature-catalog/docs/index.md | 217 + 
misc/feature-catalog/features.json | 6168 +++++++++++++++++ .../schema/feature-metadata-schema.json | 175 + .../scripts/extract_features.py | 1739 +++++ .../scripts/generate_markdown.py | 395 ++ misc/feature-catalog/scripts/utils.py | 263 + mkdocs.yml | 3 + 40 files changed, 13722 insertions(+), 40 deletions(-) create mode 100644 docs/use-case/changelog/0.19.1.md create mode 100644 misc/feature-catalog/README.md create mode 100644 misc/feature-catalog/docs/categories/advanced.md create mode 100644 misc/feature-catalog/docs/categories/configuration.md create mode 100644 misc/feature-catalog/docs/categories/connectors.md create mode 100644 misc/feature-catalog/docs/categories/generation.md create mode 100644 misc/feature-catalog/docs/categories/metadata.md create mode 100644 misc/feature-catalog/docs/categories/ui-api.md create mode 100644 misc/feature-catalog/docs/categories/validation.md create mode 100644 misc/feature-catalog/docs/comparison-with-lite.md create mode 100644 misc/feature-catalog/docs/index.md create mode 100644 misc/feature-catalog/features.json create mode 100644 misc/feature-catalog/schema/feature-metadata-schema.json create mode 100644 misc/feature-catalog/scripts/extract_features.py create mode 100644 misc/feature-catalog/scripts/generate_markdown.py create mode 100644 misc/feature-catalog/scripts/utils.py diff --git a/.gitignore b/.gitignore index 6311f5ec..c18d69c5 100644 --- a/.gitignore +++ b/.gitignore @@ -21,6 +21,7 @@ site # Python/virtualenvs used by docs .venv .python-version +__pycache__ app/docs app/out diff --git a/README.md b/README.md index 66ce7228..ae52c3f8 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ Check results at `docker/data/custom/report/index.html`. 
### UI ```shell -docker run -d -p 9898:9898 -e DEPLOY_MODE=standalone --name datacaterer datacatering/data-caterer:0.19.0 +docker run -d -p 9898:9898 -e DEPLOY_MODE=standalone --name datacaterer datacatering/data-caterer:0.19.1 ``` Open [http://localhost:9898](http://localhost:9898). diff --git a/app/build.gradle.kts b/app/build.gradle.kts index 701eb11e..36437780 100644 --- a/app/build.gradle.kts +++ b/app/build.gradle.kts @@ -543,6 +543,7 @@ tasks.shadowJar { val newTransformer = com.github.jengelman.gradle.plugins.shadow.transformers.AppendingTransformer() newTransformer.resource = "reference.conf" transformers.add(newTransformer) + mergeServiceFiles() } // Configure Scoverage only when it's applied (configuration cache disabled) diff --git a/app/src/main/scala/io/github/datacatering/datacaterer/core/plan/ForeignKeyUniquenessProcessor.scala b/app/src/main/scala/io/github/datacatering/datacaterer/core/plan/ForeignKeyUniquenessProcessor.scala index 044f1469..32390570 100644 --- a/app/src/main/scala/io/github/datacatering/datacaterer/core/plan/ForeignKeyUniquenessProcessor.scala +++ b/app/src/main/scala/io/github/datacatering/datacaterer/core/plan/ForeignKeyUniquenessProcessor.scala @@ -40,7 +40,7 @@ class ForeignKeyUniquenessProcessor(val dataCatererConfiguration: DataCatererCon validations: List[ValidationConfiguration] ): (Plan, List[Task], List[ValidationConfiguration]) = { - LOGGER.info("ForeignKeyUniquenessProcessor starting...") + LOGGER.debug("ForeignKeyUniquenessProcessor starting...") // Extract foreign keys from plan's sink options val foreignKeys = plan.sinkOptions.map(_.foreignKeys).getOrElse(List()) diff --git a/app/src/main/scala/io/github/datacatering/datacaterer/core/plan/PlanProcessor.scala b/app/src/main/scala/io/github/datacatering/datacaterer/core/plan/PlanProcessor.scala index bfcd7ef3..602c8c1d 100644 --- a/app/src/main/scala/io/github/datacatering/datacaterer/core/plan/PlanProcessor.scala +++ 
b/app/src/main/scala/io/github/datacatering/datacaterer/core/plan/PlanProcessor.scala @@ -1,7 +1,7 @@ package io.github.datacatering.datacaterer.core.plan import io.github.datacatering.datacaterer.api.PlanRun -import io.github.datacatering.datacaterer.api.model.Constants.{DATA_CATERER_INTERFACE_JAVA, DATA_CATERER_INTERFACE_SCALA, DATA_CATERER_INTERFACE_YAML, PLAN_CLASS, PLAN_STAGE_EXTRACT_METADATA, PLAN_STAGE_PARSE_PLAN} +import io.github.datacatering.datacaterer.api.model.Constants.{DATA_CATERER_INTERFACE_JAVA, DATA_CATERER_INTERFACE_SCALA, DATA_CATERER_INTERFACE_YAML, DEFAULT_STEP_TYPE, FORMAT, PLAN_CLASS, PLAN_STAGE_EXTRACT_METADATA, PLAN_STAGE_PARSE_PLAN} import io.github.datacatering.datacaterer.api.model.{DataCatererConfiguration, Plan, Task, ValidationConfiguration} import io.github.datacatering.datacaterer.core.activity.{PlanRunPostPlanProcessor, PlanRunPrePlanProcessor} import io.github.datacatering.datacaterer.core.config.ConfigParser @@ -118,7 +118,7 @@ object PlanProcessor { basePlan, baseTasks, baseValidations, dataCatererConfiguration, resolvedInterface ) - LOGGER.info(s"After pre-processors: num-tasks=${finalTasks.size}") + LOGGER.info(s"After pre-processors: num-tasks=${finalTasks.size}, task-names=${finalTasks.map(_.name).mkString(", ")}") // Step 4: Generate data with the final modified plan/tasks val dataGeneratorProcessor = new DataGeneratorProcessor(dataCatererConfiguration) @@ -358,7 +358,13 @@ class YamlPlanRun( // Merge connection config into each step's options (connection config as base, step options override) val stepsWithConnectionConfig = task.steps.map(step => { - step.copy(options = connectionConfig ++ step.options) + val mergedOptions = connectionConfig ++ step.options + val optionsWithFormat = if (!mergedOptions.contains(FORMAT) && step.`type` != DEFAULT_STEP_TYPE) { + mergedOptions + (FORMAT -> step.`type`) + } else { + mergedOptions + } + step.copy(options = optionsWithFormat) }) task.copy(steps = stepsWithConnectionConfig) @@ 
-386,7 +392,13 @@ class UnifiedPlanRun( val connectionConfig = dataCatererConfig.connectionConfigByName.getOrElse(dataSourceName, Map()) val stepsWithConnectionConfig = task.steps.map(step => { - step.copy(options = connectionConfig ++ step.options) + val mergedOptions = connectionConfig ++ step.options + val optionsWithFormat = if (!mergedOptions.contains(FORMAT) && step.`type` != DEFAULT_STEP_TYPE) { + mergedOptions + (FORMAT -> step.`type`) + } else { + mergedOptions + } + step.copy(options = optionsWithFormat) }) task.copy(steps = stepsWithConnectionConfig) diff --git a/app/src/test/resources/sample/plan/customer-create-plan.yaml b/app/src/test/resources/sample/plan/customer-create-plan.yaml index 53dd84b4..8ca66544 100644 --- a/app/src/test/resources/sample/plan/customer-create-plan.yaml +++ b/app/src/test/resources/sample/plan/customer-create-plan.yaml @@ -15,4 +15,7 @@ tasks: enabled: false - name: "json_schema_file" dataSourceName: "json" + enabled: false + - name: "csv_transaction_file" + dataSourceName: "csv" enabled: true diff --git a/app/src/test/resources/sample/task/file/csv-transaction-task.yaml b/app/src/test/resources/sample/task/file/csv-transaction-task.yaml index 8eaf3588..a07f0972 100644 --- a/app/src/test/resources/sample/task/file/csv-transaction-task.yaml +++ b/app/src/test/resources/sample/task/file/csv-transaction-task.yaml @@ -38,4 +38,4 @@ steps: - name: "rank" type: "int" options: - oneOf: ["1->0.8", "2->0.1", "3->0.1"] + oneOf: [1, 2, 3] diff --git a/docs/docs/generator/foreign-key.md b/docs/docs/generator/foreign-key.md index b5ab273d..ce7a6d4f 100644 --- a/docs/docs/generator/foreign-key.md +++ b/docs/docs/generator/foreign-key.md @@ -13,7 +13,7 @@ particular fields. ## Single field -Define a field in one data source to match against another field. +Define a field in one data source to match against another field. 
Below example shows a `postgres` data source with two tables, `accounts` and `transactions` that have a foreign key for `account_id`. @@ -34,7 +34,7 @@ for `account_id`. field().name("full_name"), ... ); - + plan().addForeignKeyRelationship( postgresAcc, "account_id", List.of(Map.entry(postgresTxn, "account_id")) @@ -129,7 +129,7 @@ and `name` from `accounts` to match with `account_id` and `full_name` to match i field().name("full_name"), ... ); - + plan().addForeignKeyRelationship( postgresAcc, List.of("account_id", "name"), List.of(Map.entry(postgresTxn, List.of("account_id", "full_name"))) @@ -227,7 +227,7 @@ data source contains `account_id` which is a concatenation of `ACC` with `accoun field().name("account_number").omit(true) //using this field for intermediate calculation, not included in final result with omit=true ... ); - + plan().addForeignKeyRelationship( postgresAcc, List.of("account_number"), List.of(Map.entry(jsonTask, List.of("account_number"))) @@ -312,7 +312,7 @@ data source contains `account_id` which is a concatenation of `ACC` with `accoun Your schema structure can have nested fields which can also be referenced as foreign keys. But to do so, you need to create a proxy field that gets omitted from the final saved data. - + In the example below, the nested `customer_details.name` field inside the `json` task needs to match with `name` from `postgres`. A new field in the `json` called `_txn_name` is used as a temporary field to facilitate the foreign key definition. @@ -337,7 +337,7 @@ key definition. ), field().name("_txn_name").omit(true) //value will not be included in output ); - + plan().addForeignKeyRelationship( postgresAcc, List.of("account_id", "name"), List.of(Map.entry(jsonTask, List.of("account_id", "_txn_name"))) @@ -361,7 +361,7 @@ key definition. .fields( field.name("name").sql("_txn_name") //nested field will get value from '_txn_name' ... 
- ), + ), field.name("_txn_name").omit(true) //value will not be included in output ) @@ -428,9 +428,493 @@ key definition. - "_txn_name" ``` +## Cardinality (one-to-many) + +By default, when a foreign key is defined without cardinality configuration, Data Caterer applies a **1:1 mapping** +where the child record count is adjusted to match the parent record count. To generate multiple child records per +parent (e.g. 5 orders per customer), you must configure cardinality on the target relation. + +### Default behavior (1:1) + +Without cardinality, the child count is adjusted to equal the parent count, regardless of the `records` value +you set on the child step. + +``` +customers (3 records) orders (3 records, adjusted from requested 20) ++-------------+----------+ +----------+-------------+--------+ +| customer_id | name | | order_id | customer_id | amount | ++-------------+----------+ +----------+-------------+--------+ +| CUST001 | Alice | | ORD001 | CUST001 | 150.00 | +| CUST002 | Bob | | ORD002 | CUST002 | 200.00 | +| CUST003 | Carol | | ORD003 | CUST003 | 75.00 | ++-------------+----------+ +----------+-------------+--------+ +``` + +### Fixed ratio + +Use `ratio` to specify an exact number of child records per parent. For example, `ratio: 5.0` generates +exactly 5 orders per customer. 
+ +``` +customers (3 records) orders (15 records = 3 x 5) ++-------------+----------+ +----------+-------------+--------+ +| customer_id | name | | order_id | customer_id | amount | ++-------------+----------+ +----------+-------------+--------+ +| CUST001 | Alice | | ORD001 | CUST001 | 150.00 | +| CUST002 | Bob | | ORD002 | CUST001 | 200.00 | +| CUST003 | Carol | | ORD003 | CUST001 | 75.00 | ++-------------+----------+ | ORD004 | CUST001 | 300.00 | + | ORD005 | CUST001 | 50.00 | + | ORD006 | CUST002 | 120.00 | + | ORD007 | CUST002 | 80.00 | + | ORD008 | CUST002 | 210.00 | + | ORD009 | CUST002 | 95.00 | + | ORD010 | CUST002 | 175.00 | + | ORD011 | CUST003 | 60.00 | + | ORD012 | CUST003 | 340.00 | + | ORD013 | CUST003 | 110.00 | + | ORD014 | CUST003 | 225.00 | + | ORD015 | CUST003 | 190.00 | + +----------+-------------+--------+ +``` + +=== "Java" + + ```java + plan().addForeignKeyRelationship( + postgresCustomers, "customer_id", + List.of(Map.entry(postgresOrders, "customer_id")), + new CardinalityConfig().ratio(5.0) + ); + ``` + +=== "Scala" + + ```scala + plan.addForeignKeyRelationship( + postgresCustomers, "customer_id", + List(postgresOrders -> "customer_id"), + CardinalityConfig(ratio = Some(5.0)) + ) + ``` + +=== "YAML" + + ```yaml + foreignKeys: + - source: + dataSource: "my_postgres" + step: "customers" + fields: ["customer_id"] + generate: + - dataSource: "my_postgres" + step: "orders" + fields: ["customer_id"] + cardinality: + ratio: 5.0 + ``` + +### Bounded range (min/max) + +Use `min` and `max` to generate a variable number of child records per parent. Each parent will get between +`min` and `max` children. 
+ +``` +customers (3 records) orders (variable, 2-4 per customer) ++-------------+----------+ +----------+-------------+--------+ +| customer_id | name | | order_id | customer_id | amount | ++-------------+----------+ +----------+-------------+--------+ +| CUST001 | Alice | | ORD001 | CUST001 | 150.00 | +| CUST002 | Bob | | ORD002 | CUST001 | 200.00 | +| CUST003 | Carol | | ORD003 | CUST001 | 75.00 | ++-------------+----------+ | ORD004 | CUST002 | 300.00 | + | ORD005 | CUST002 | 50.00 | + | ORD006 | CUST003 | 120.00 | + | ORD007 | CUST003 | 80.00 | + | ORD008 | CUST003 | 210.00 | + | ORD009 | CUST003 | 95.00 | + +----------+-------------+--------+ +``` + + +=== "Java" + + ```java + plan().addForeignKeyRelationship( + postgresCustomers, "customer_id", + List.of(Map.entry(postgresOrders, "customer_id")), + new CardinalityConfig().min(2).max(4) + ); + ``` + +=== "Scala" + + ```scala + plan.addForeignKeyRelationship( + postgresCustomers, "customer_id", + List(postgresOrders -> "customer_id"), + CardinalityConfig(min = Some(2), max = Some(4)) + ) + ``` + +=== "YAML" + + ```yaml + foreignKeys: + - source: + dataSource: "my_postgres" + step: "customers" + fields: ["customer_id"] + generate: + - dataSource: "my_postgres" + step: "orders" + fields: ["customer_id"] + cardinality: + min: 2 + max: 4 + ``` + +!!! note "How record counts are calculated" + When cardinality is configured, the child step's `records` count is automatically adjusted: + + - **Fixed ratio**: `child records = parent records x ratio` + - **Bounded range**: `child records = parent records x max` (maximum possible) + + The original `records` value on the child step is overridden. The system sets up a `perField` count + internally to group child records by the FK field. + +!!! warning "Do not mix ratio and min/max" + You cannot specify both `ratio` and `min`/`max` in the same cardinality configuration. + Use one approach or the other. 
+ +## Generation modes + +Generation modes control how foreign key values are assigned, specifically whether all records should have +valid FK references or if some should be intentionally invalid. This is configured via the `generationMode` +property on the target relation. + +### all-exist (default) + +Every child record gets a valid FK value referencing an existing parent. This is the default behavior and +produces data with full referential integrity. + +``` +customers orders (all-exist) ++-------------+----------+ +----------+-------------+ +| customer_id | name | | order_id | customer_id | ++-------------+----------+ +----------+-------------+ +| CUST001 | Alice | | ORD001 | CUST001 | <-- valid +| CUST002 | Bob | | ORD002 | CUST003 | <-- valid +| CUST003 | Carol | | ORD003 | CUST002 | <-- valid ++-------------+----------+ | ORD004 | CUST001 | <-- valid + | ORD005 | CUST003 | <-- valid + +----------+-------------+ +``` + +=== "YAML" + + ```yaml + foreignKeys: + - source: + dataSource: "my_postgres" + step: "customers" + fields: ["customer_id"] + generate: + - dataSource: "my_postgres" + step: "orders" + fields: ["customer_id"] + generationMode: "all-exist" # default, can be omitted + ``` + +### partial + +A mix of valid and null FK values. This mode requires both a `nullability` configuration and a `cardinality` +configuration. The cardinality determines how many child records per parent, while the nullability specifies +what percentage of records should have null FKs. Useful for simulating data quality issues or testing how +your application handles missing references. + +!!! warning "Partial mode requires cardinality" + The `partial` generation mode only applies null FK values when combined with a `cardinality` configuration. + Without cardinality, `partial` mode behaves identically to `all-exist` (all FKs are valid). 
+ +``` +customers orders (partial with cardinality, 30% null) ++-------------+----------+ +----------+-------------+ +| customer_id | name | | order_id | customer_id | ++-------------+----------+ +----------+-------------+ +| CUST001 | Alice | | ORD001 | CUST002 | <-- valid +| CUST002 | Bob | | ORD002 | NULL | <-- null +| CUST003 | Carol | | ORD003 | CUST001 | <-- valid ++-------------+----------+ | ORD004 | CUST003 | <-- valid + | ORD005 | NULL | <-- null + +----------+-------------+ +``` + +=== "YAML" + + ```yaml + foreignKeys: + - source: + dataSource: "my_postgres" + step: "customers" + fields: ["customer_id"] + generate: + - dataSource: "my_postgres" + step: "orders" + fields: ["customer_id"] + generationMode: "partial" + cardinality: + ratio: 3.0 # required for partial mode to apply nulls + nullability: + nullPercentage: 0.3 # 30% of records will have null FKs + strategy: "random" + ``` + +### all-combinations + +Generates all possible valid/invalid combinations across FK fields. With `n` FK fields, this produces `2^n` +patterns. Useful for exhaustive testing of how your application handles every possible combination of +valid and invalid references. + +For a single FK field, this produces 2 patterns: valid and invalid. +For two FK fields, this produces 4 patterns, as shown in the example below. + +Invalid values are generated as `INVALID_` prefixed strings (for string fields) or random numbers (for +numeric fields), not null values. This ensures that invalid references are distinguishable from missing data. 
+ +``` +locations stores (all-combinations with 2 FK fields) ++---------+-------+ +----------+------------+------------+ +| country | state | | store_id | country | state | ++---------+-------+ +----------+------------+------------+ +| USA | NY | | S001 | USA | NY | <-- both valid +| USA | CA | | S002 | USA | INVALID_a1 | <-- country valid, state invalid ++---------+-------+ | S003 | INVALID_b2 | NY | <-- country invalid, state valid + | S004 | INVALID_c3 | INVALID_d4 | <-- both invalid + +----------+------------+------------+ +``` + +!!! note "Invalid values are not null" + The `all-combinations` mode generates `INVALID_` prefixed strings (or random numbers for numeric types) + rather than null values. If you need null FK values, use the `nullability` configuration instead. + +=== "YAML" + + ```yaml + foreignKeys: + - source: + dataSource: "my_postgres" + step: "locations" + fields: ["country", "state"] + generate: + - dataSource: "my_postgres" + step: "stores" + fields: ["country", "state"] + generationMode: "all-combinations" + ``` + +## Nullability + +Nullability controls the percentage of FK values that should be set to `null`, independent of the generation mode. +When used with `all-exist` mode, it applies null values as a post-processing step after valid FK values have been +assigned. 
+ +### Configuration + +| Property | Type | Default | Description | +| ---------------- | -------- | ---------- | ------------------------------------------------ | +| `nullPercentage` | `double` | `0.0` | Percentage of records with null FKs (0.0 to 1.0) | +| `strategy` | `string` | `"random"` | How to select which records get nulls | + +### Strategies + +| Strategy | Description | +| -------- | --------------------------------------------- | +| `random` | Nulls are distributed randomly across records | +| `head` | The first N% of records get null FKs | +| `tail` | The last N% of records get null FKs | + +``` +customers orders (nullability: 20%, strategy: random) ++-------------+----------+ +----------+-------------+ +| customer_id | name | | order_id | customer_id | ++-------------+----------+ +----------+-------------+ +| CUST001 | Alice | | ORD001 | CUST002 | <-- valid +| CUST002 | Bob | | ORD002 | CUST001 | <-- valid +| CUST003 | Carol | | ORD003 | NULL | <-- null (20%) ++-------------+----------+ | ORD004 | CUST003 | <-- valid + | ORD005 | CUST001 | <-- valid + +----------+-------------+ +``` + + +=== "Java" + + ```java + plan().addForeignKeyRelationship( + postgresCustomers, "customer_id", + List.of(Map.entry(postgresOrders, "customer_id")), + new NullabilityConfig(0.2, "random") + ); + ``` + +=== "Scala" + + ```scala + plan.addForeignKeyRelationship( + postgresCustomers, "customer_id", + List(postgresOrders -> "customer_id"), + NullabilityConfig(0.2, "random") + ) + ``` + +=== "YAML" + + ```yaml + foreignKeys: + - source: + dataSource: "my_postgres" + step: "customers" + fields: ["customer_id"] + generate: + - dataSource: "my_postgres" + step: "orders" + fields: ["customer_id"] + nullability: + nullPercentage: 0.2 + strategy: "random" + ``` + +## Combining cardinality, generation mode, and nullability + +These features can be combined to create realistic test scenarios. When combined, they are applied in this order: + +1. 
**Cardinality** - determines how many child records per parent +2. **Generation mode** - determines whether FK values are valid or invalid +3. **Nullability** - post-processes to null out a percentage of FK values + +!!! warning "Combination restrictions" + - **`partial` mode** requires `cardinality` to produce null FK values. Without cardinality, it behaves + like `all-exist`. + - **`all-combinations` mode** is incompatible with `cardinality`. When both are specified, + `all-combinations` falls back to `all-exist` mode while preserving the cardinality structure. + +### Example: E-commerce order generation + +Generate 100 customers, each with 2-8 orders, where 5% of orders have null customer references. + +``` +customers (100 records) orders (200-800 records, 5% null FKs) ++-------------+----------+ +----------+-------------+--------+ +| customer_id | name | | order_id | customer_id | amount | ++-------------+----------+ +----------+-------------+--------+ +| CUST001 | Alice | | ORD001 | CUST001 | 150.00 | +| CUST002 | Bob | | ORD002 | CUST001 | 200.00 | +| ... | ... | | ORD003 | CUST001 | 75.00 | +| CUST100 | Zara | | ORD004 | NULL | 300.00 | <-- 5% null ++-------------+----------+ | ORD005 | CUST002 | 50.00 | + | ... | ... | ... 
| + +----------+-------------+--------+ +``` + +=== "YAML" + + ```yaml + version: "1.0" + name: "ecommerce_data" + + sinkOptions: + seed: "42" + + dataSources: + - name: "my_postgres" + connection: + type: "postgres" + options: + url: "jdbc:postgresql://localhost:5432/ecommerce" + user: "postgres" + password: "postgres" + steps: + - name: "customers" + options: + dbtable: "public.customers" + count: + records: 100 + fields: + - name: "customer_id" + options: + regex: "CUST[0-9]{6}" + - name: "name" + options: + expression: "#{Name.fullName}" + + - name: "orders" + options: + dbtable: "public.orders" + count: + records: 100 + fields: + - name: "order_id" + options: + regex: "ORD[0-9]{8}" + - name: "customer_id" + - name: "amount" + type: "double" + options: + min: 10.0 + max: 1000.0 + + foreignKeys: + - source: + dataSource: "my_postgres" + step: "customers" + fields: ["customer_id"] + generate: + - dataSource: "my_postgres" + step: "orders" + fields: ["customer_id"] + cardinality: + min: 2 + max: 8 + nullability: + nullPercentage: 0.05 + strategy: "random" + ``` + +!!! note "Child record count with cardinality" + The `records` count on the child step is automatically adjusted when cardinality is configured. + In the above example, the orders `records: 100` will be overridden to `100 x 8 = 800` (parent count x max). + You can set it to any value since it will be recalculated. + +## Deterministic generation with seed + +To produce the same data across runs, set a `seed` in `sinkOptions`. This ensures that FK value assignment, +nullability selection, and cardinality distribution are all deterministic. + +=== "YAML" + + ```yaml + sinkOptions: + seed: "42" + + foreignKeys: + - source: + dataSource: "db" + step: "parents" + fields: ["id"] + generate: + - dataSource: "db" + step: "children" + fields: ["parent_id"] + cardinality: + ratio: 3.0 + nullability: + nullPercentage: 0.1 + ``` + ## Ordering -When defining relationships/foreign keys, the order matters. 
The source of the foreign key is generated first, then the children +When defining relationships/foreign keys, the order matters. The source of the foreign key is generated first, then the children foreign keys are generated. This is to ensure that the source data is available for the children to reference. When using the HTTP data sources, it gives you the opportunity to define the order in which the requests are executed. @@ -505,13 +989,44 @@ Below is how you can define the order of the HTTP data sources. - "pathParamid" ``` +## Configuration reference + +### Foreign key target options + +These options are configured on each entry in the `generate` list. + +| Property | Type | Default | Description | +| ---------------- | -------------- | ------------- | --------------------------------------------------------------------- | +| `dataSource` | `string` | required | Name of the target data source | +| `step` | `string` | required | Name of the target step (table/topic/file) | +| `fields` | `list[string]` | required | FK field names in the target, mapped positionally to source fields | +| `cardinality` | `object` | none | One-to-many relationship configuration (see below) | +| `generationMode` | `string` | `"all-exist"` | FK generation strategy: `all-exist`, `partial`, or `all-combinations` | +| `nullability` | `object` | none | Null FK percentage configuration (see below) | + +### Cardinality options + +| Property | Type | Default | Description | +| -------------- | --------- | ----------- | ------------------------------------------------------------------ | +| `ratio` | `double` | none | Exact number of children per parent (e.g. 
`5.0` = 5 children each) | +| `min` | `integer` | none | Minimum children per parent (used with `max`) | +| `max` | `integer` | none | Maximum children per parent (used with `min`) | +| `distribution` | `string` | `"uniform"` | Distribution pattern: `uniform`, `normal`, or `zipf` | + +### Nullability options + +| Property | Type | Default | Description | +| ---------------- | -------- | ---------- | ----------------------------------------------------------- | +| `nullPercentage` | `double` | `0.0` | Fraction of records with null FKs (0.0 to 1.0) | +| `strategy` | `string` | `"random"` | Null selection strategy: `random`, `head`, or `tail` | + ## Fast Relationships You may want to generate a large number of records whilst retaining relationships across datasets. This consumes a lot of memory as Data Caterer will keep track of generated values and will check for global uniqueness. There are some tactics that can be used to avoid defining a relationships but still maintain the same values across -datasets by leveraging incremental values. When you define an incremental value, it will be globally unique across the +datasets by leveraging incremental values. When you define an incremental value, it will be globally unique across the data generated for that field. Below is an example where you have `accounts` and `transactions` where the same `id` values should appear in both datasets. @@ -527,7 +1042,7 @@ values should appear in both datasets. 
.fields( field().name("id").type(LongType.instance()).incremental() ); - + var config = configuration() .enableCount(false) .enableSinkMetadata(false) diff --git a/docs/docs/generator/transformation.md b/docs/docs/generator/transformation.md index 8cb8bb40..3a64fd53 100644 --- a/docs/docs/generator/transformation.md +++ b/docs/docs/generator/transformation.md @@ -754,7 +754,7 @@ When you configure a transformation on a step, it will be applied when generatin - Transformation options (from configuration) are passed to the transformer - Same transformer classes work for both full generation and sampling -See the [API Documentation](../api.md#sample-generation) for more details on sample endpoints. +See the [API Documentation](../api.md#sample-data-generation) for more details on sample endpoints. ## Example Transformers diff --git a/docs/get-started/quick-start.md b/docs/get-started/quick-start.md index 68a38683..2f15ed70 100644 --- a/docs/get-started/quick-start.md +++ b/docs/get-started/quick-start.md @@ -156,7 +156,7 @@ A web interface for creating and running data generation plans. ### Run ```shell -docker run -d -p 9898:9898 -e DEPLOY_MODE=standalone --name datacaterer datacatering/data-caterer:0.19.0 +docker run -d -p 9898:9898 -e DEPLOY_MODE=standalone --name datacaterer datacatering/data-caterer:0.19.1 ``` Open [http://localhost:9898](http://localhost:9898) in your browser. @@ -203,6 +203,6 @@ After running, check the generated report: --- - [Supported connections](../docs/connection/index.md) - databases, files, messaging, and HTTP. + [Supported connections](../docs/connection.md) - databases, files, messaging, and HTTP. diff --git a/docs/index.md b/docs/index.md index 73bde07b..c9328e49 100644 --- a/docs/index.md +++ b/docs/index.md @@ -552,7 +552,7 @@ validating the downstream data sources have the correct information. 
| Type | Interface | User | |-----------|--------------------------------------------------------|--------------------------------------| -| No Code | [UI](get-started/quick-start.md#mac) | QA, Testers, Data Scientist, Analyst | +| No Code | [UI](get-started/quick-start.md) | QA, Testers, Data Scientist, Analyst | | Low Code | [YAML](get-started/quick-start.md#yaml) | DevOps, Kubernetes Fans | | High Code | [Java/Scala](get-started/quick-start.md#javascala-api) | Software Developers, Data Engineers | diff --git a/docs/migrations/README.md b/docs/migrations/README.md index f7b2f8e8..98d3bd75 100644 --- a/docs/migrations/README.md +++ b/docs/migrations/README.md @@ -4,7 +4,7 @@ This directory contains migration guides and tools for upgrading between Data Ca ## Available Migrations -### [YAML Unified Format Migration](yaml-unified-format/) +### [YAML Unified Format Migration](yaml-unified-format/README.md) **From:** Legacy YAML format (separate plan + task files) **To:** Unified YAML format v1.0 (single-file configuration) diff --git a/docs/migrations/yaml-unified-format/MIGRATION.md b/docs/migrations/yaml-unified-format/MIGRATION.md index 3fea2083..dc8ad6ca 100644 --- a/docs/migrations/yaml-unified-format/MIGRATION.md +++ b/docs/migrations/yaml-unified-format/MIGRATION.md @@ -220,14 +220,14 @@ python3 migrate_yaml.py plan.yaml --task-folder /path/to/tasks ``` ### Issue: Unsupported field options -**Solution**: Some advanced options may need manual adjustment. Check the [unified YAML schema](misc/schema/unified-config-schema.json) for supported options. +**Solution**: Some advanced options may need manual adjustment. Check the [unified YAML schema](https://github.com/data-catering/data-caterer/blob/176baa3762ccdca05b024ccd2efcd6335359e713/misc/schema/unified-config-schema.json) for supported options. ### Issue: Validation references **Solution**: Update validation `dataSourceName` to match the new data source names in the unified format. 
## Getting Help -- **Documentation**: See [Configuration Guide](docs/docs/configuration.md) +- **Documentation**: See [Configuration Guide](../../docs/configuration.md) - **Examples**: Check `misc/schema/examples/` for unified YAML examples - **Issues**: Report migration problems at [GitHub Issues](https://github.com/data-catering/data-caterer/issues) diff --git a/docs/use-case/changelog/0.15.0.md b/docs/use-case/changelog/0.15.0.md index 84cf133e..ea957d8b 100644 --- a/docs/use-case/changelog/0.15.0.md +++ b/docs/use-case/changelog/0.15.0.md @@ -15,7 +15,7 @@ Latest feature and fixes for Data Catering include: - Add in `bigquery` as a data source - [Check BigQuery documentation here](../../docs/guide/data-source/database/bigquery.md) - Allow for empty sequences to be generated for per field counts - - [Check count per field documentation here](../../docs/generator/count.md#per-field) + - [Check count per field documentation here](../../docs/generator/count.md) - Calculate number of records generated based on foreign key definitions - [Check foreign key documentation here](../../docs/generator/foreign-key.md) - Unpersist DataFrame after generating data to avoid OOM errors diff --git a/docs/use-case/changelog/0.16.10.md b/docs/use-case/changelog/0.16.10.md index 1bbfff70..1f877ba4 100644 --- a/docs/use-case/changelog/0.16.10.md +++ b/docs/use-case/changelog/0.16.10.md @@ -16,13 +16,13 @@ Latest features and fixes for Data Catering include: - When UI sends a plan with only task summaries, system automatically discovers and loads full YAML plan by name - Searches configured plan directory for matching YAML files - Supports mixing UI-created plans with YAML-defined tasks - - [Check here for documentation](../../docs/api.md#yaml-plans-and-connection-configuration) + - [Check here for documentation](../../docs/api.md) - **Connection Configuration Auto-Merge**: Connection details from `application.conf` automatically merge with task configurations - Connection format 
(e.g., `json`, `postgres`, `kafka`) derived from top-level configuration key - Eliminates duplication of connection details across task files - Step-specific options override connection defaults (proper precedence) - - [Check here for documentation](../../docs/connection.md#automatic-connection-merging-with-yaml-plans) + - [Check here for documentation](../../docs/connection.md) ## Core Engine Improvements diff --git a/docs/use-case/changelog/0.16.2.md b/docs/use-case/changelog/0.16.2.md index 1a2457f2..abfbfc16 100644 --- a/docs/use-case/changelog/0.16.2.md +++ b/docs/use-case/changelog/0.16.2.md @@ -20,6 +20,6 @@ Latest feature and fixes for Data Catering include: - Previous order was `regex`, `oneOf`, `expression`, `sql`, `random` - [Check data generation documentation here](../../docs/generator/data-generator.md) - Fix bug for deeply nested SQL fields not being applied correctly - - [Check SQL field generation documentation here](../../docs/generator/data-generator.md#sql) + - [Check SQL field generation documentation here](../../docs/generator/data-generator.md) - Add in `enableFastGeneration` for automatically applying optimizations for faster completion of data generation - [Check fast generation mode documentation here](../../docs/configuration.md#fast-generation-mode) diff --git a/docs/use-case/changelog/0.16.3.md b/docs/use-case/changelog/0.16.3.md index d2b4b4a2..da5c63fc 100644 --- a/docs/use-case/changelog/0.16.3.md +++ b/docs/use-case/changelog/0.16.3.md @@ -13,4 +13,4 @@ Latest feature and fixes for Data Catering include: - Fix bug when using foreign key relationships with nested fields beyond 3 levels - [Check foreign key documentation here](../../docs/generator/foreign-key.md) - Allow for referencing array fields in SQL generated fields - - [Check SQL field generation documentation here](../../docs/generator/data-generator.md#sql) + - [Check SQL field generation documentation here](../../docs/generator/data-generator.md) diff --git 
a/docs/use-case/changelog/0.16.4.md b/docs/use-case/changelog/0.16.4.md index bf8c9ad5..c6128778 100644 --- a/docs/use-case/changelog/0.16.4.md +++ b/docs/use-case/changelog/0.16.4.md @@ -11,4 +11,4 @@ Deployed: 24-09-2025 Latest feature and fixes for Data Catering include: - Fix bug when using `omit` fields in SQL generated fields - - [Check SQL field generation documentation here](../../docs/generator/data-generator.md#sql) + - [Check SQL field generation documentation here](../../docs/generator/data-generator.md) diff --git a/docs/use-case/changelog/0.16.5.md b/docs/use-case/changelog/0.16.5.md index ea9d5806..4cefd508 100644 --- a/docs/use-case/changelog/0.16.5.md +++ b/docs/use-case/changelog/0.16.5.md @@ -11,6 +11,6 @@ Deployed: 25-09-2025 Latest feature and fixes for Data Catering include: - Allow for deeply nested SQL references in arrays or objects - - [Check SQL field generation documentation here](../../docs/generator/data-generator.md#sql) + - [Check SQL field generation documentation here](../../docs/generator/data-generator.md) - Ability to use `unwrapTopLevelArray` to allow for top-level JSON arrays - [Check JSON data source documentation here](../../docs/guide/data-source/file/json.md) diff --git a/docs/use-case/changelog/0.16.8.md b/docs/use-case/changelog/0.16.8.md index e2dfab1c..c655fade 100644 --- a/docs/use-case/changelog/0.16.8.md +++ b/docs/use-case/changelog/0.16.8.md @@ -13,7 +13,7 @@ Latest feature and fixes for Data Catering include: - Introduce sample-data generation APIs with docs, new Gradle run tasks, improved logging, and stricter sink format validation with accompanying tests. - API/UI: - New endpoints: POST /sample/task-file, POST /sample/task-yaml (supports raw YAML and JSON), POST /sample/schema in PlanRoutes. 
- - [Check API documentation here](../../docs/api.md#sample-data-generation-endpoints) + - [Check API documentation here](../../docs/api.md#sample-data-generation) - Request models & unmarshaller: Add SampleModels and custom TaskYamlUnmarshaller. - Repository wiring: PlanRepository adds GenerateFromTaskFile/TaskYaml/Schema commands using FastSampleGenerator; Spark warm-up on startup. - Core/Generation: diff --git a/docs/use-case/changelog/0.19.1.md b/docs/use-case/changelog/0.19.1.md new file mode 100644 index 00000000..fd325478 --- /dev/null +++ b/docs/use-case/changelog/0.19.1.md @@ -0,0 +1,37 @@ +--- +title: "Data Caterer 0.19.1 release notes" +description: "Foreign key cardinality and nullability controls, Hadoop compatibility fix, and documentation improvements." +image: "https://data.catering/diagrams/logo/data_catering_logo.svg" +--- + +# 0.19.1 + +Deployed: 14-02-2026 + +Foreign key relationships get cardinality, generation modes, and nullability controls, plus a Hadoop compatibility fix and documentation cleanup. 
+ +## Foreign Key Enhancements + +- **Cardinality (one-to-many)**: Configure fixed ratio or bounded min/max child records per parent for foreign key relationships + ([Cardinality docs](../../docs/generator/foreign-key.md#cardinality-one-to-many)) +- **Generation modes**: Control FK value validity with `all-exist`, `partial`, and `all-combinations` modes for testing referential integrity scenarios + ([Generation modes docs](../../docs/generator/foreign-key.md#generation-modes)) +- **Nullability**: Configure percentage of null FK values with `random`, `head`, or `tail` strategies + ([Nullability docs](../../docs/generator/foreign-key.md#nullability)) +- **Deterministic generation**: Seed support for reproducible FK assignment, nullability selection, and cardinality distribution + ([Seed docs](../../docs/generator/foreign-key.md#deterministic-generation-with-seed)) +- **Configuration reference**: Complete reference table for all foreign key target, cardinality, and nullability options + ([Reference docs](../../docs/generator/foreign-key.md#configuration-reference)) + +## Bug Fixes + +- **Hadoop service file merging**: Added `mergeServiceFiles()` to shadow JAR build to resolve Hadoop filesystem provider conflicts +- **Step format inference**: Automatically set the `format` option from step `type` when not explicitly provided, fixing issues with YAML-configured file data sources +- **Logging improvements**: Reduced noisy `ForeignKeyUniquenessProcessor` log from INFO to DEBUG; added task names to post-processor log output + +## Documentation + +- **Foreign key documentation**: Comprehensive rewrite with visual diagrams, code examples in Java/Scala/YAML, and a full configuration reference +- **Broken link fixes**: Fixed invalid anchor links across changelogs and documentation pages +- **Data Caterer YAML metadata source**: Added to mkdocs navigation +- **Version bump**: Updated Docker image references, quick-start guide, and example Dockerfile to 0.19.1 diff --git 
a/docs/use-case/roadmap.md b/docs/use-case/roadmap.md index 8860157f..00cbc44a 100644 --- a/docs/use-case/roadmap.md +++ b/docs/use-case/roadmap.md @@ -10,22 +10,22 @@ Items below summarise the roadmap of Data Caterer. As each task gets completed, | Feature | Description | Sub Tasks | |----------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Data source support | Batch or real time data sources that can be added to Data Caterer. Support data sources that users want | - :white_check_mark: AWS, GCP and Azure related data services ([cloud storage](../docs/advanced.md#cloud-storage))
- :white_check_mark: [Delta Lake](../docs/guide/data-source/file/delta-lake.md)
- :white_check_mark: [Iceberg](../docs/guide/data-source/file/iceberg.md)
- :white_check_mark: [Hudi](../docs/guide/data-source/file/hudi.md)
- :white_check_mark: [RabbitMQ](../docs/guide/data-source/jms/rabbitmq.md)
- :white_check_mark: [Solace](../docs/guide/data-source/jms/solace.md)
- :white_check_mark: [BigQuery](../docs/guide/data-source/database/bigquery.md)
- ActiveMQ
- MongoDB
- [Elasticsearch](https://github.com/data-catering/data-caterer/issues/7)
- [Snowflake](https://github.com/data-catering/data-caterer/issues/6)
- [Databricks](https://github.com/data-catering/data-caterer/issues/5)
- Pulsar | -| Metadata discovery | Allow for schema and data profiling from external metadata sources | - :white_check_mark: [HTTP (OpenAPI spec)](../docs/guide/data-source/http/http.md)
- :white_check_mark: [JSON Schema](../docs/guide/data-source/metadata/json-schema.md)
- :white_check_mark: [YAML configurations](../docs/guide/data-source/metadata/yaml-configurations.md)
- :white_check_mark: [OpenLineage metadata (Marquez)](../docs/guide/data-source/metadata/marquez.md)
- :white_check_mark: [OpenMetadata](../docs/guide/data-source/metadata/open-metadata.md)
- :white_check_mark: [Open Data Contract Standard (ODCS)](../docs/guide/data-source/metadata/open-data-contract-standard.md)
- :white_check_mark: [Data Contract CLI](../docs/guide/data-source/metadata/data-contract-cli.md)
- :white_check_mark: [Confluent Schema Registry](../docs/guide/data-source/metadata/confluent-schema-registry.md)
- Amundsen
- Datahub
- Solace Event Portal
- Airflow
- [DBT](https://github.com/data-catering/data-caterer/issues/8)
- Manually insert create table statement from UI | +| Data source support | Batch or real time data sources that can be added to Data Caterer. Support data sources that users want | - :white_check_mark: AWS, GCP and Azure related data services ([cloud storage](../docs/advanced.md#cloud-storage))
- :white_check_mark: [Delta Lake](../docs/guide/data-source/file/delta-lake.md)
- :white_check_mark: [Iceberg](../docs/guide/data-source/file/iceberg.md)
- :white_check_mark: [RabbitMQ](../docs/guide/data-source/messaging/rabbitmq.md)
- :white_check_mark: [Kafka](../docs/guide/data-source/messaging/kafka.md)
- :white_check_mark: [Solace](../docs/guide/data-source/messaging/solace.md)
- :white_check_mark: [BigQuery](../docs/guide/data-source/database/bigquery.md)
- ActiveMQ
- MongoDB
- [Elasticsearch](https://github.com/data-catering/data-caterer/issues/7)
- [Snowflake](https://github.com/data-catering/data-caterer/issues/6)
- [Databricks](https://github.com/data-catering/data-caterer/issues/5)
- Pulsar | +| Metadata discovery | Allow for schema and data profiling from external metadata sources | - :white_check_mark: [HTTP (OpenAPI spec)](../docs/guide/data-source/http/http.md)
- :white_check_mark: [JSON Schema](../docs/guide/data-source/metadata/json-schema.md)
- :white_check_mark: [YAML configurations](../docs/guide/data-source/metadata/yaml-configurations.md)
- :white_check_mark: [OpenLineage metadata (Marquez)](../docs/guide/data-source/metadata/marquez.md)
- :white_check_mark: [OpenMetadata](../docs/guide/data-source/metadata/open-metadata.md)
- :white_check_mark: [Open Data Contract Standard (ODCS)](../docs/guide/data-source/metadata/open-data-contract-standard.md)
- :white_check_mark: [Data Contract CLI](../docs/guide/data-source/metadata/data-contract-cli.md)
- Amundsen
- Datahub
- Solace Event Portal
- Airflow
- [DBT](https://github.com/data-catering/data-caterer/issues/8)
- Manually insert create table statement from UI | | Developer API | Scala/Java interface for developers/testers to create data generation and validation tasks | - :white_check_mark: [Scala](https://github.com/data-catering/data-caterer-example)
- :white_check_mark: [Java](https://github.com/data-catering/data-caterer-example)
- :white_check_mark: [YAML](../docs/guide/data-source/metadata/yaml-configurations.md)
- Python
- Javascript | | Report generation | Generate a report that summarises the data generation or validation results | - :white_check_mark: [Report for data generated and validation rules](../sample/report/html/index.html) | -| UI portal | Allow users to access a UI to input data generation or validation tasks. Also be able to view report results | - :white_check_mark: [Base UI with create, edit and delete plan, connections and history](../get-started/quick-start.md)
- :white_check_mark: [Run on Mac, Linux and Windows](../get-started/quick-start.md)
- :white_check_mark: User authentication and usage tracking
- :white_check_mark: Store data generation/validation run information in file/database
- :white_check_mark: [Preview of generated data via sample endpoints](../docs/sample.md)
- Metadata stored in database
- Additional dialog to confirm delete and execute plan | +| UI portal | Allow users to access a UI to input data generation or validation tasks. Also be able to view report results | - :white_check_mark: [Base UI with create, edit and delete plan, connections and history](../get-started/quick-start.md)
- :white_check_mark: [Run on Mac, Linux and Windows](../get-started/quick-start.md)
- :white_check_mark: User authentication and usage tracking
- :white_check_mark: Store data generation/validation run information in file/database
- :white_check_mark: Preview of generated data via sample endpoints
- Metadata stored in database
- Additional dialog to confirm delete and execute plan | | Integration with data validation tools | Derive data validation rules from existing data validation tools | - :white_check_mark: [Great Expectation](../docs/validation/external-source-validation.md#great-expectations)
- :white_check_mark: [OpenMetadata](../docs/validation/external-source-validation.md#openmetadata)
- [DBT constraints](https://docs.getdbt.com/reference/resource-properties/constraints)
- [SodaCL](https://docs.soda.io/soda-cl/soda-cl-overview.html)
- [MonteCarlo](https://docs.getmontecarlo.com/docs/monitors-as-code) | | Data validation rule suggestions | Based on metadata, generate data validation rules appropriate for the dataset | - :white_check_mark: Suggest basic data validations (yet to document) | | Wait conditions before data validation | Define certain conditions to be met before starting data validations | - :white_check_mark: [Webhook](../docs/validation.md#webhook)
- :white_check_mark: [File exists](../docs/validation.md#file-exists)
- :white_check_mark: [Data exists via SQL expression](../docs/validation.md#data-exists)
- :white_check_mark: [Pause](../docs/validation.md#pause) | | Validation types | Ability to define simple/complex data validations | - :white_check_mark: [Basic validations](../docs/validation/basic-validation.md)
- :white_check_mark: [Aggregates](../docs/validation/group-by-validation.md) (sum of amount per account is > 500)
- :white_check_mark: [Relationship](../docs/validation/upstream-data-source-validation.md) (at least one account entry in history table per account in accounts table)
- :white_check_mark: [Field name (check field count, field names, ordering)](../docs/validation/field-name-validation.md)
- :white_check_mark: [Pre-conditions before validating data](https://github.com/data-catering/data-caterer/issues/3)
- Ordering (transactions are ordered by date)
- Data profile (how close the generated data profile is compared to the expected data profile) | -| Data generation features | Advanced data generation capabilities for realistic test data | - :white_check_mark: [Custom transformations](../docs/generator/transformation.md) (per-record and whole-file)
- :white_check_mark: [Distribution-based generation](../docs/generator/data-generator.md#distributions) (normal, exponential)
- :white_check_mark: [Weighted value selection](../docs/generator/data-generator.md#weighted-values)
- :white_check_mark: [Reference mode for foreign keys](../docs/generator/data-generator.md#reference-mode)
- :white_check_mark: [Field filtering](../docs/guide/data-source/metadata/json-schema.md#field-filtering) (include/exclude patterns)
- :white_check_mark: [Cover all possible cases (i.e. record for each combination of oneOf values, positive/negative values, pairwise etc.)](https://github.com/data-catering/data-caterer/issues/4)
- Ability to override edge cases | -| Performance optimization | Features to improve data generation speed and efficiency | - :white_check_mark: [Fast regex generation](../docs/generator/data-generator.md#regex-patterns) (SQL-based, ~5-6x faster)
- :white_check_mark: [Unique value optimization with Bloom filters](../docs/configuration.md#unique-value-configuration)
- :white_check_mark: [Fast generation mode](../docs/configuration.md#fast-generation-mode) (automatic optimizations)
- :white_check_mark: [Performance testing infrastructure](../docs/use-case/changelog/0.17.1.md)
- :white_check_mark: [HTTP rate limiting](../docs/guide/data-source/http/http.md#rate-limiting) | +| Data generation features | Advanced data generation capabilities for realistic test data | - :white_check_mark: [Custom transformations](../docs/generator/transformation.md) (per-record and whole-file)
- :white_check_mark: [Distribution-based generation](../docs/generator/data-generator.md) (normal, exponential)
- :white_check_mark: [Weighted value selection](../docs/generator/data-generator.md)
- :white_check_mark: [Reference mode for foreign keys](../docs/generator/data-generator.md#reference-mode)
- :white_check_mark: [Field filtering](../docs/guide/data-source/metadata/json-schema.md#field-filtering) (include/exclude patterns)
- :white_check_mark: [Cover all possible cases (i.e. record for each combination of oneOf values, positive/negative values, pairwise etc.)](https://github.com/data-catering/data-caterer/issues/4)
- Ability to override edge cases | +| Performance optimization | Features to improve data generation speed and efficiency | - :white_check_mark: [Fast regex generation](../docs/generator/data-generator.md#regex-patterns) (SQL-based, ~5-6x faster)
- :white_check_mark: [Unique value optimization with Bloom filters](../docs/configuration.md)
- :white_check_mark: [Fast generation mode](../docs/configuration.md#fast-generation-mode) (automatic optimizations)
- :white_check_mark: Performance testing infrastructure
- :white_check_mark: [HTTP rate limiting](../docs/guide/data-source/http/http.md) | | Alerting | When tasks have completed, ability to define alerts based on certain conditions | - :white_check_mark: [Slack](../docs/report/alert.md#slack)
- Email | | Metadata enhancements | Based on data profiling or inference, can add to existing metadata | - PII detection (can integrate with [Presidio](https://microsoft.github.io/presidio/analyzer/))
- Relationship detection across data sources
- SQL generation
- Ordering information | | Data cleanup | Ability to clean up generated data | - :white_check_mark: [Clean up generated data](../docs/guide/scenario/delete-generated-data.md)
- :white_check_mark: [Clean up data in consumer data sinks](../docs/delete-data.md)
- Clean up data from real time sources (i.e. DELETE HTTP endpoint, delete events in JMS) | | Trial version | Trial version of the full app for users to test out all the features | - :white_check_mark: [Trial app to try out all features](../get-started/quick-start.md) | | Code generation | Based on metadata or existing classes, code for data generation and validation could be generated | - Code generation
- Schema generation from Scala/Java class | | Real time response data validations | Ability to define data validations based on the response from real time data sources (e.g. HTTP response) | - :white_check_mark: [HTTP response data validation](../docs/guide/data-source/http/http.md#validation) | -| Infrastructure & CI/CD | Infrastructure improvements for development and deployment | - :white_check_mark: Pre-plan and post-plan processors
- :white_check_mark: [Benchmark results tracking](../use-case/benchmark/README.md) | +| Infrastructure & CI/CD | Infrastructure improvements for development and deployment | - :white_check_mark: Pre-plan and post-plan processors
- :white_check_mark: Benchmark results tracking | diff --git a/example/Dockerfile b/example/Dockerfile index bd907a7f..dfba0adb 100644 --- a/example/Dockerfile +++ b/example/Dockerfile @@ -20,7 +20,7 @@ COPY src ./src RUN ./gradlew clean build --no-daemon # Stage 2: Runtime image based on Data Caterer -ARG DATA_CATERER_VERSION=0.19.0 +ARG DATA_CATERER_VERSION=0.19.1 FROM datacatering/data-caterer:${DATA_CATERER_VERSION} # Copy the built JAR from the builder stage diff --git a/gradle.properties b/gradle.properties index f0f27781..c2de0823 100644 --- a/gradle.properties +++ b/gradle.properties @@ -1,5 +1,5 @@ group=io.github.data-catering -version=0.19.0 +version=0.19.1 org.gradle.parallel=true org.gradle.caching=true diff --git a/misc/feature-catalog/README.md b/misc/feature-catalog/README.md new file mode 100644 index 00000000..d604bc45 --- /dev/null +++ b/misc/feature-catalog/README.md @@ -0,0 +1,105 @@ +# Feature Catalog + +Standardized documentation of all Data Caterer features, extracted semi-automatically from source code. 
+ +## Quick Start + +```bash +# Generate features.json from source code +python3 scripts/extract_features.py + +# Generate Markdown documentation from features.json +python3 scripts/generate_markdown.py +``` + +## Structure + +``` +feature-catalog/ +├── README.md # This file +├── features.json # Master feature catalog (generated) +├── schema/ +│ └── feature-metadata-schema.json # JSON Schema for feature metadata +├── scripts/ +│ ├── utils.py # Shared utilities +│ ├── extract_features.py # Extract features from source code +│ └── generate_markdown.py # Generate Markdown from features.json +└── docs/ + ├── index.md # Main overview and navigation + └── categories/ + ├── connectors.md # Data source connectors (16) + ├── generation.md # Data generation features (55) + ├── validation.md # Validation features (42) + ├── configuration.md # Configuration options (29) + ├── advanced.md # Advanced features (11) + ├── metadata.md # Metadata integration (10) + └── ui-api.md # UI and API features (6) +``` + +## Features Overview + +| Category | Count | Description | +|----------|-------|-------------| +| Data Source Connectors | 16 | Databases, files, messaging, HTTP | +| Data Generation | 55 | Regex, faker, SQL, field options, types | +| Data Validation | 42 | Field, statistical, expression, cross-source | +| Configuration | 29 | Flags, folders, runtime, alerts | +| Advanced | 11 | Foreign keys, streaming, transformations | +| Metadata Integration | 10 | OpenMetadata, Great Expectations, etc. | +| UI/API | 6 | Web UI features | +| **Total** | **169** | | + +## How It Works + +### Extraction Sources + +The extraction script parses these source files: + +1. **Constants.scala** - All configuration constants, validation types, data source types +2. **ConfigModels.scala** - Configuration case classes with defaults +3. **Unified config schema** - YAML schema with all supported properties +4. 
**YAML examples** - Real-world usage patterns + +### Feature Metadata Schema + +Each feature has: +- **id**: Unique identifier (dot notation, e.g., `generation.field.regex`) +- **name**: Human-readable name +- **category**: One of: connectors, generation, validation, configuration, advanced, metadata, ui_api +- **status**: stable, experimental, deprecated, planned +- **description**: What the feature does +- **configuration**: Array of config options (name, type, default, scope, YAML path) +- **examples**: YAML/Scala code examples +- **tags**: Searchable tags +- **sourceFiles**: Implementation locations +- **relatedFeatures**: Links to related features + +## Updating the Catalog + +When adding new features to Data Caterer: + +1. Add constants to `Constants.scala` / config to `ConfigModels.scala` +2. Run `python3 scripts/extract_features.py` to regenerate `features.json` +3. Run `python3 scripts/generate_markdown.py` to regenerate docs +4. Review and commit + +For features not captured by automated extraction, manually add them to `extract_features.py` in the appropriate extraction function. + +## Adapting for Other Projects + +This system is designed to be reusable: + +1. **Replace extraction sources**: Modify `extract_features.py` to parse your config files +2. **Customize schema**: Extend `feature-metadata-schema.json` with custom fields +3. **Adjust categories**: Update `CATEGORY_META` in `generate_markdown.py` +4. 
**Change output**: Swap `generate_markdown.py` for your preferred format + +The core `utils.py` provides generic utilities for: +- Parsing Scala lazy vals and case classes +- Creating standardized feature/config/example objects +- File I/O helpers + +## Requirements + +- Python 3.10+ +- No external dependencies (uses only stdlib) diff --git a/misc/feature-catalog/docs/categories/advanced.md b/misc/feature-catalog/docs/categories/advanced.md new file mode 100644 index 00000000..69c29268 --- /dev/null +++ b/misc/feature-catalog/docs/categories/advanced.md @@ -0,0 +1,306 @@ +# Advanced Features + +Foreign key relationships, streaming load patterns, custom transformations, metadata-driven generation, and more. + +**11 features** in this category. + +## Table of Contents + +- [Foreign Key Relationships](#referential-integrity) (4 features) +- [Record Count](#count) (1 feature) +- [Streaming Settings](#streaming) (1 feature) +- [Transformation](#transformation) (1 feature) +- [Step Options](#step-options) (1 feature) +- [Reference Mode](#reference) (1 feature) +- [Interfaces](#interfaces) (1 feature) +- [Configuration](#configuration) (1 feature) + +## Foreign Key Relationships + +### Foreign Key Relationships + +**ID**: `advanced.foreign_keys` +**Status**: Stable + +Define foreign key relationships between data sources to maintain referential integrity. Supports composite keys, cardinality control, nullability, and multiple generation modes. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `source` | object | Yes | `-` | Source table containing primary key YAML: `foreignKeys[].source` | +| `generate` | array | No | `-` | Target tables with foreign key references YAML: `foreignKeys[].generate` | +| `delete` | array | No | `-` | Target tables for cleanup YAML: `foreignKeys[].delete` | + +**Examples**: + +**Foreign key with cardinality**: +```yaml +foreignKeys: + - source: + dataSource: postgres_db + step: customers + fields: ["customer_id"] + generate: + - dataSource: postgres_db + step: orders + fields: ["customer_id"] + cardinality: + min: 1 + max: 10 + distribution: "uniform" +``` + +**Tags**: `advanced`, `foreign-key`, `referential-integrity`, `relationship` + +--- + +### Foreign Key Cardinality Control + +**ID**: `advanced.foreign_key_cardinality` +**Status**: Stable + +Control the cardinality of foreign key relationships. Set min/max records per parent, ratio multipliers, and distribution patterns. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `min` | integer | No | `-` | Minimum records per parent key YAML: `foreignKeys[].generate[].cardinality.min` | +| `max` | integer | No | `-` | Maximum records per parent key YAML: `foreignKeys[].generate[].cardinality.max` | +| `ratio` | double | No | `-` | Ratio multiplier (e.g., 10.0 = 10x parent records) YAML: `foreignKeys[].generate[].cardinality.ratio` | +| `distribution` | enum | No | `uniform` | Cardinality distribution Values: `uniform`, `normal`, `zipf`, `power` YAML: `foreignKeys[].generate[].cardinality.distribution` | + +**Tags**: `advanced`, `cardinality`, `distribution`, `foreign-key` + +--- + +### Foreign Key Nullability + +**ID**: `advanced.foreign_key_nullability` +**Status**: Stable + +Control null value injection in foreign key fields. 
Configure percentage of nulls and distribution strategy (random, head, tail). + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `nullPercentage` | double | No | `-` | Percentage of null values (0-1) YAML: `foreignKeys[].generate[].nullability.nullPercentage` | +| `strategy` | enum | No | `random` | Null distribution strategy Values: `random`, `head`, `tail` YAML: `foreignKeys[].generate[].nullability.strategy` | + +**Tags**: `advanced`, `nullability`, `foreign-key`, `null` + +--- + +### Foreign Key Generation Modes + +**ID**: `advanced.foreign_key_generation_modes` +**Status**: Stable + +Control how foreign key values are generated. "all-exist" ensures all records have valid FKs, "all-combinations" generates all possible combinations, "partial" creates a mix of valid and invalid references. + +**Use Cases**: +- all-exist: Standard referential integrity testing +- all-combinations: Comprehensive join testing with all possible combinations +- partial: Testing handling of orphan records and broken references + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `generationMode` | enum | No | `all-exist` | FK generation strategy Values: `all-exist`, `all-combinations`, `partial` | + +**Tags**: `advanced`, `foreign-key`, `generation-mode` + +--- + +## Record Count + +### Record Count Configuration + +**ID**: `advanced.count` +**Status**: Stable + +Configure how many records to generate per step. Supports fixed count, per-field distribution, and streaming rate-based generation. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `records` | integer | No | `1000` | Total records to generate YAML: `dataSources[].steps[].count.records` | +| `perField` | object | No | `-` | Generate records per unique field value YAML: `dataSources[].steps[].count.perField` | + +**Examples**: + +**Fixed count**: +```yaml +count: + records: 5000 +``` + +**Per-field count distribution**: +```yaml +count: + records: 100 + perField: + fieldNames: ["account_id"] + options: + min: 1 + max: 5 +``` + +**Tags**: `advanced`, `count`, `records`, `distribution` + +--- + +## Streaming Settings + +### Streaming Load Patterns + +**ID**: `advanced.streaming_load_patterns` +**Status**: Stable + +Define time-based data generation patterns for streaming scenarios. Supports ramp, spike, sine, and custom step patterns. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `duration` | string | No | `-` | Streaming duration (e.g., 10m, 1h) YAML: `dataSources[].steps[].count.duration` | +| `rate` | integer | No | `-` | Records per time unit YAML: `dataSources[].steps[].count.rate` | +| `rateUnit` | enum | No | `-` | Time unit for rate Values: `second`, `minute`, `hour` YAML: `dataSources[].steps[].count.rateUnit` | +| `pattern.type` | enum | No | `-` | Load pattern type Values: `ramp`, `spike`, `sine`, `steps` YAML: `dataSources[].steps[].count.pattern.type` | +| `pattern.startRate` | integer | No | `-` | Starting rate for ramp pattern | +| `pattern.endRate` | integer | No | `-` | Ending rate for ramp pattern | +| `pattern.baseRate` | integer | No | `-` | Base rate for spike pattern | +| `pattern.spikeRate` | integer | No | `-` | Spike rate | +| `pattern.amplitude` | integer | No | `-` | Amplitude for sine pattern | +| `pattern.frequency` | double | No | `-` | Frequency for sine pattern | +| `pattern.steps` | array | No 
| `-` | Custom step definitions with rate and duration | + +**Examples**: + +**Ramp load pattern**: +```yaml +count: + duration: "1m" + rate: 100 + rateUnit: "second" + pattern: + type: "ramp" + startRate: 10 + endRate: 200 +``` + +**Tags**: `advanced`, `streaming`, `load-pattern`, `rate` + +--- + +## Transformation + +### Post-Generation Transformation + +**ID**: `advanced.transformation` +**Status**: Stable + +Apply custom Java/Scala transformations to generated data before writing to output. Supports whole-file and row-by-row modes. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `className` | string | Yes | `-` | Fully qualified transformation class name YAML: `dataSources[].steps[].transformation.className` | +| `methodName` | string | No | `transform` | Method to call | +| `mode` | enum | No | `-` | Transformation mode Values: `whole-file`, `row-by-row` | +| `outputPath` | string | No | `-` | Output directory | +| `deleteOriginal` | boolean | No | `-` | Delete input after transformation | +| `enabled` | boolean | No | `true` | Enable/disable transformation | + +**Tags**: `advanced`, `transformation`, `custom`, `plugin` + +--- + +## Step Options + +### Step Field Filtering + +**ID**: `advanced.step_options` +**Status**: Stable + +Include or exclude fields from metadata-driven generation using exact names or patterns. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `includeFields` | array | No | `-` | List of field names to include | +| `excludeFields` | array | No | `-` | List of field names to exclude | +| `includeFieldPatterns` | array | No | `-` | Regex patterns for fields to include | +| `excludeFieldPatterns` | array | No | `-` | Regex patterns for fields to exclude | +| `allCombinations` | boolean | No | `-` | Generate all field value combinations | + +**Tags**: `advanced`, `step`, `filtering`, `metadata` + +--- + +## Reference Mode + +### Reference Mode + +**ID**: `advanced.reference_mode` +**Status**: Stable + +Load existing data as reference for foreign key relationships instead of generating new data. Useful when you need realistic FK values from existing datasets. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `enableReferenceMode` | boolean | No | `false` | Enable reference mode for this data source | +| `enableDataGeneration` | boolean | No | `true` | Enable data generation for this data source; set to `false` when using reference mode | + +**Tags**: `advanced`, `reference`, `existing-data`, `foreign-key` + +--- + +## Interfaces + +### Configuration Interfaces + +**ID**: `advanced.interfaces` +**Status**: Stable + +Data Caterer supports multiple configuration interfaces: Java API, Scala API, YAML configuration, and Web UI. 
+ +**Use Cases**: +- Java API: Programmatic configuration from Java applications +- Scala API: Programmatic configuration with Scala builders +- YAML: Declarative configuration for CI/CD and automation +- Web UI: Visual configuration and execution management + +**Tags**: `advanced`, `interface`, `api`, `yaml`, `ui` + +--- + +## Configuration + +### Environment Variable Substitution + +**ID**: `advanced.env_substitution` +**Status**: Stable + +Use ${VAR_NAME} syntax in YAML configuration to substitute environment variables at runtime. Supports default values with ${VAR:-default}. + +**Examples**: + +**Environment variable substitution**: +```yaml +options: + password: "${DB_PASSWORD}" + url: "${KAFKA_BROKERS:-localhost:9092}" +``` + +**Tags**: `advanced`, `environment`, `variable`, `secrets` + +--- diff --git a/misc/feature-catalog/docs/categories/configuration.md b/misc/feature-catalog/docs/categories/configuration.md new file mode 100644 index 00000000..117bfc7f --- /dev/null +++ b/misc/feature-catalog/docs/categories/configuration.md @@ -0,0 +1,605 @@ +# Configuration + +Runtime configuration for controlling generation behavior, validation, performance tuning, alerts, and output paths. + +**29 features** in this category. + +## Table of Contents + +- [Feature Flags](#feature-flags) (14 features) +- [Folder Paths](#folder-paths) (7 features) +- [Generation Settings](#generation-settings) (2 features) +- [Metadata Settings](#metadata-settings) (1 feature) +- [Streaming Settings](#streaming-settings) (1 feature) +- [Alert Settings](#alert-settings) (1 feature) +- [Validation Runtime](#validation-runtime) (1 feature) +- [Spark Runtime](#spark-runtime) (1 feature) +- [Sink Options](#sink-options) (1 feature) + +## Feature Flags + +### Count Records + +**ID**: `configuration.flags.enablecount` +**Status**: Stable + +Count the number of records generated for each data source step. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `enableCount` | boolean | No | `true` | Count the number of records generated for each data source step. YAML: `config.flags.enableCount` | + +**Source Files**: +- `api/src/main/scala/io/github/datacatering/datacaterer/api/model/ConfigModels.scala` (primary) + +**Tags**: `configuration`, `flag`, `count` + +--- + +### Generate Data + +**ID**: `configuration.flags.enablegeneratedata` +**Status**: Stable + +Enable or disable data generation. When false, only validation runs. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `enableGenerateData` | boolean | No | `true` | Enable or disable data generation. When false, only validation runs. YAML: `config.flags.enableGenerateData` | + +**Source Files**: +- `api/src/main/scala/io/github/datacatering/datacaterer/api/model/ConfigModels.scala` (primary) + +**Tags**: `configuration`, `flag`, `generatedata` + +--- + +### Record Tracking + +**ID**: `configuration.flags.enablerecordtracking` +**Status**: Stable + +Track generated records for later cleanup/deletion. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `enableRecordTracking` | boolean | No | `false` | Track generated records for later cleanup/deletion. YAML: `config.flags.enableRecordTracking` | + +**Source Files**: +- `api/src/main/scala/io/github/datacatering/datacaterer/api/model/ConfigModels.scala` (primary) + +**Tags**: `configuration`, `flag`, `recordtracking` + +--- + +### Delete Generated Records + +**ID**: `configuration.flags.enabledeletegeneratedrecords` +**Status**: Stable + +Enable cleanup mode to delete previously generated records. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `enableDeleteGeneratedRecords` | boolean | No | `false` | Enable cleanup mode to delete previously generated records. YAML: `config.flags.enableDeleteGeneratedRecords` | + +**Source Files**: +- `api/src/main/scala/io/github/datacatering/datacaterer/api/model/ConfigModels.scala` (primary) + +**Tags**: `configuration`, `flag`, `deletegeneratedrecords` + +--- + +### Auto-Generate Plan and Tasks + +**ID**: `configuration.flags.enablegenerateplanandtasks` +**Status**: Stable + +Automatically generate plan and tasks from metadata sources. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `enableGeneratePlanAndTasks` | boolean | No | `false` | Automatically generate plan and tasks from metadata sources. YAML: `config.flags.enableGeneratePlanAndTasks` | + +**Source Files**: +- `api/src/main/scala/io/github/datacatering/datacaterer/api/model/ConfigModels.scala` (primary) + +**Tags**: `configuration`, `flag`, `generateplanandtasks` + +--- + +### Fail on Error + +**ID**: `configuration.flags.enablefailonerror` +**Status**: Stable + +Fail execution immediately when errors occur. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `enableFailOnError` | boolean | No | `true` | Fail execution immediately when errors occur. YAML: `config.flags.enableFailOnError` | + +**Source Files**: +- `api/src/main/scala/io/github/datacatering/datacaterer/api/model/ConfigModels.scala` (primary) + +**Tags**: `configuration`, `flag`, `failonerror` + +--- + +### Unique Check + +**ID**: `configuration.flags.enableuniquecheck` +**Status**: Stable + +Validate uniqueness constraints during data generation. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `enableUniqueCheck` | boolean | No | `false` | Validate uniqueness constraints during data generation. YAML: `config.flags.enableUniqueCheck` | + +**Source Files**: +- `api/src/main/scala/io/github/datacatering/datacaterer/api/model/ConfigModels.scala` (primary) + +**Tags**: `configuration`, `flag`, `uniquecheck` + +--- + +### Sink Metadata + +**ID**: `configuration.flags.enablesinkmetadata` +**Status**: Stable + +Save metadata about generated data to the sink. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `enableSinkMetadata` | boolean | No | `false` | Save metadata about generated data to the sink. YAML: `config.flags.enableSinkMetadata` | + +**Source Files**: +- `api/src/main/scala/io/github/datacatering/datacaterer/api/model/ConfigModels.scala` (primary) + +**Tags**: `configuration`, `flag`, `sinkmetadata` + +--- + +### Save Reports + +**ID**: `configuration.flags.enablesavereports` +**Status**: Stable + +Generate and save execution reports with generation and validation results. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `enableSaveReports` | boolean | No | `true` | Generate and save execution reports with generation and validation results. YAML: `config.flags.enableSaveReports` | + +**Source Files**: +- `api/src/main/scala/io/github/datacatering/datacaterer/api/model/ConfigModels.scala` (primary) + +**Tags**: `configuration`, `flag`, `savereports` + +--- + +### Data Validation + +**ID**: `configuration.flags.enablevalidation` +**Status**: Stable + +Run data validations after generation completes. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `enableValidation` | boolean | No | `true` | Run data validations after generation completes. YAML: `config.flags.enableValidation` | + +**Source Files**: +- `api/src/main/scala/io/github/datacatering/datacaterer/api/model/ConfigModels.scala` (primary) + +**Tags**: `configuration`, `flag`, `validation` + +--- + +### Suggest Validations + +**ID**: `configuration.flags.enablegeneratevalidations` +**Status**: Stable + +Auto-suggest validations based on data analysis. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `enableGenerateValidations` | boolean | No | `false` | Auto-suggest validations based on data analysis. YAML: `config.flags.enableGenerateValidations` | + +**Source Files**: +- `api/src/main/scala/io/github/datacatering/datacaterer/api/model/ConfigModels.scala` (primary) + +**Tags**: `configuration`, `flag`, `generatevalidations` + +--- + +### Alerts + +**ID**: `configuration.flags.enablealerts` +**Status**: Stable + +Send alert notifications on completion (supports Slack). + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `enableAlerts` | boolean | No | `true` | Send alert notifications on completion (supports Slack). YAML: `config.flags.enableAlerts` | + +**Source Files**: +- `api/src/main/scala/io/github/datacatering/datacaterer/api/model/ConfigModels.scala` (primary) + +**Tags**: `configuration`, `flag`, `alerts` + +--- + +### Unique Check Only In Batch + +**ID**: `configuration.flags.enableuniquecheckonlyinbatch` +**Status**: Stable + +Check uniqueness only within the current batch for better performance. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `enableUniqueCheckOnlyInBatch` | boolean | No | `false` | Check uniqueness only within the current batch for better performance. YAML: `config.flags.enableUniqueCheckOnlyInBatch` | + +**Source Files**: +- `api/src/main/scala/io/github/datacatering/datacaterer/api/model/ConfigModels.scala` (primary) + +**Tags**: `configuration`, `flag`, `uniquecheckonlyinbatch` + +--- + +### Fast Generation + +**ID**: `configuration.flags.enablefastgeneration` +**Status**: Stable + +Use SQL-based generation for regex patterns instead of UDFs. Dramatically improves performance. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `enableFastGeneration` | boolean | No | `false` | Use SQL-based generation for regex patterns instead of UDFs. Dramatically improves performance. YAML: `config.flags.enableFastGeneration` | + +**Source Files**: +- `api/src/main/scala/io/github/datacatering/datacaterer/api/model/ConfigModels.scala` (primary) + +**Tags**: `configuration`, `flag`, `fastgeneration` + +--- + +## Folder Paths + +### plan File + +**ID**: `configuration.folders.planfilepath` +**Status**: Stable + +Configuration path for planFilePath. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `planFilePath` | string | No | `-` | Path setting for planFilePath YAML: `config.folders.planFilePath` | + +**Tags**: `configuration`, `folder`, `path` + +--- + +### task Folder + +**ID**: `configuration.folders.taskfolderpath` +**Status**: Stable + +Configuration path for taskFolderPath. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `taskFolderPath` | string | No | `-` | Path setting for taskFolderPath YAML: `config.folders.taskFolderPath` | + +**Tags**: `configuration`, `folder`, `path` + +--- + +### generatedPlanAndTask Folder + +**ID**: `configuration.folders.generatedplanandtaskfolderpath` +**Status**: Stable + +Configuration path for generatedPlanAndTaskFolderPath. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `generatedPlanAndTaskFolderPath` | string | No | `-` | Path setting for generatedPlanAndTaskFolderPath YAML: `config.folders.generatedPlanAndTaskFolderPath` | + +**Tags**: `configuration`, `folder`, `path` + +--- + +### generatedReports Folder + +**ID**: `configuration.folders.generatedreportsfolderpath` +**Status**: Stable + +Configuration path for generatedReportsFolderPath. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `generatedReportsFolderPath` | string | No | `-` | Path setting for generatedReportsFolderPath YAML: `config.folders.generatedReportsFolderPath` | + +**Tags**: `configuration`, `folder`, `path` + +--- + +### recordTracking Folder + +**ID**: `configuration.folders.recordtrackingfolderpath` +**Status**: Stable + +Configuration path for recordTrackingFolderPath. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `recordTrackingFolderPath` | string | No | `-` | Path setting for recordTrackingFolderPath YAML: `config.folders.recordTrackingFolderPath` | + +**Tags**: `configuration`, `folder`, `path` + +--- + +### validation Folder + +**ID**: `configuration.folders.validationfolderpath` +**Status**: Stable + +Configuration path for validationFolderPath. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `validationFolderPath` | string | No | `-` | Path setting for validationFolderPath YAML: `config.folders.validationFolderPath` | + +**Tags**: `configuration`, `folder`, `path` + +--- + +### recordTrackingForValidation Folder + +**ID**: `configuration.folders.recordtrackingforvalidationfolderpath` +**Status**: Stable + +Configuration path for recordTrackingForValidationFolderPath. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `recordTrackingForValidationFolderPath` | string | No | `-` | Path setting for recordTrackingForValidationFolderPath YAML: `config.folders.recordTrackingForValidationFolderPath` | + +**Tags**: `configuration`, `folder`, `path` + +--- + +## Generation Settings + +### Batch Size + +**ID**: `configuration.generation.batch_size` +**Status**: Stable + +Control the number of records generated per batch. Affects memory usage and performance. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `numRecordsPerBatch` | long | No | `100000` | Records per batch YAML: `config.generation.numRecordsPerBatch` | +| `numRecordsPerStep` | long | No | `-` | Default records per step/table YAML: `config.generation.numRecordsPerStep` | + +**Tags**: `configuration`, `generation`, `batch`, `performance` + +--- + +### Bloom Filter Configuration + +**ID**: `configuration.generation.bloom_filter` +**Status**: Stable + +Configure bloom filter parameters for uniqueness checking during generation. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `uniqueBloomFilterNumItems` | long | No | `10000000` | Expected number of items in bloom filter YAML: `config.generation.uniqueBloomFilterNumItems` | +| `uniqueBloomFilterFalsePositiveProbability` | double | No | `0.01` | Bloom filter false positive rate (0-1) YAML: `config.generation.uniqueBloomFilterFalsePositiveProbability` | + +**Tags**: `configuration`, `generation`, `bloom-filter`, `uniqueness` + +--- + +## Metadata Settings + +### Metadata Analysis Configuration + +**ID**: `configuration.metadata` +**Status**: Stable + +Configure how metadata is sampled and analyzed for auto-generation of field patterns. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `numRecordsFromDataSource` | integer | No | `10000` | Sample size from data source YAML: `config.metadata.numRecordsFromDataSource` | +| `numRecordsForAnalysis` | integer | No | `10000` | Records analyzed for pattern detection YAML: `config.metadata.numRecordsForAnalysis` | +| `oneOfDistinctCountVsCountThreshold` | double | No | `0.2` | Threshold for detecting oneOf fields YAML: `config.metadata.oneOfDistinctCountVsCountThreshold` | +| `oneOfMinCount` | long | No | `1000` | Minimum records for oneOf detection YAML: `config.metadata.oneOfMinCount` | +| `numGeneratedSamples` | integer | No | `10` | Number of sample records in metadata suggestions YAML: `config.metadata.numGeneratedSamples` | + +**Tags**: `configuration`, `metadata`, `analysis`, `sampling` + +--- + +## Streaming Settings + +### Streaming Configuration + +**ID**: `configuration.streaming` +**Status**: Stable + +Configure streaming/real-time data generation parameters. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `maxTimeoutSeconds` | integer | No | `3600` | Maximum streaming timeout | +| `maxAsyncParallelism` | integer | No | `100` | Maximum async parallelism | +| `responseBufferSize` | integer | No | `10000` | Response buffer size for streaming | +| `timestampWindowMs` | long | No | `1000` | Timestamp window in milliseconds | + +**Tags**: `configuration`, `streaming`, `real-time`, `performance` + +--- + +## Alert Settings + +### Alert Configuration + +**ID**: `configuration.alerts` +**Status**: Stable + +Configure alert notifications triggered on execution completion. Supports Slack integration. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `triggerOn` | enum | No | `all` | When to trigger alerts Values: `all`, `failure`, `success`, `generation_failure`, `validation_failure`, `generation_success`, `validation_success` YAML: `config.alert.triggerOn` | +| `slackToken` | string | No | `-` | Slack API token YAML: `config.alert.slackToken` | +| `slackChannels` | array | No | `-` | Slack channels to notify YAML: `config.alert.slackChannels` | + +**Tags**: `configuration`, `alert`, `notification`, `slack` + +--- + +## Validation Runtime + +### Validation Runtime Configuration + +**ID**: `configuration.validation` +**Status**: Stable + +Configure validation execution behavior. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `numSampleErrorRecords` | integer | No | `5` | Number of sample error records in reports YAML: `config.validation.numSampleErrorRecords` | +| `enableDeleteRecordTrackingFiles` | boolean | No | `true` | Delete tracking files after validation YAML: `config.validation.enableDeleteRecordTrackingFiles` | + +**Tags**: `configuration`, `validation`, `runtime` + +--- + +## Spark Runtime + +### Apache Spark Configuration + +**ID**: `configuration.runtime.spark` +**Status**: Stable + +Configure the Apache Spark runtime for data processing. Set master URL, driver/executor memory, and Spark SQL settings. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `master` | string | No | `local[*]` | Spark master URL YAML: `config.runtime.master` | +| `sparkConfig` | object | No | `-` | Spark configuration key-value pairs YAML: `config.runtime.sparkConfig` | + +**Examples**: + +**Spark configuration**: +```yaml +config: + runtime: + master: "local[4]" + sparkConfig: + "spark.driver.memory": "4g" + "spark.sql.shuffle.partitions": "10" +``` + +**Tags**: `configuration`, `runtime`, `spark`, `performance` + +--- + +## Sink Options + +### Global Sink Options + +**ID**: `configuration.sink_options` +**Status**: Stable + +Global options for data output: random seed for reproducibility and locale for data generation. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `seed` | string | No | `-` | Random seed for reproducible generation YAML: `sinkOptions.seed` | +| `locale` | string | No | `-` | Locale for data generation (affects names, addresses) YAML: `sinkOptions.locale` | + +**Examples**: + +**Sink options**: +```yaml +sinkOptions: + seed: "42" + locale: "en-US" +``` + +**Tags**: `configuration`, `sink`, `seed`, `locale` + +--- diff --git a/misc/feature-catalog/docs/categories/connectors.md b/misc/feature-catalog/docs/categories/connectors.md new file mode 100644 index 00000000..60d9f4ad --- /dev/null +++ b/misc/feature-catalog/docs/categories/connectors.md @@ -0,0 +1,486 @@ +# Data Source Connectors + +Data Caterer supports connecting to databases, file systems, messaging systems, and HTTP APIs for reading and writing test data. + +**16 features** in this category. + +## Table of Contents + +- [Databases](#databases) (4 features) +- [File Formats](#file-formats) (8 features) +- [Messaging Systems](#messaging-systems) (3 features) +- [HTTP/REST](#httprest) (1 feature) + +## Databases + +### PostgreSQL Connector + +**ID**: `connector.databases.postgres` +**Status**: Stable + +Connect to PostgreSQL databases for reading and writing data. Supports table-level configuration, custom queries, and JDBC options. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `url` | string | Yes | `-` | JDBC connection URL YAML: `dataSources[].connection.options.url` | +| `user` | string | No | `-` | Database username YAML: `dataSources[].connection.options.user` | +| `password` | string | No | `-` | Database password YAML: `dataSources[].connection.options.password` | +| `driver` | string | No | `org.postgresql.Driver` | JDBC driver class | +| `dbtable` | string | No | `-` | Target table (schema.table) YAML: `dataSources[].steps[].options.dbtable` | +| `query` | string | No | `-` | Custom SQL query for reading | + +**Examples**: + +**PostgreSQL data generation**: +```yaml +dataSources: + - name: my_postgres + connection: + type: postgres + options: + url: "jdbc:postgresql://localhost:5432/mydb" + user: "postgres" + password: "${POSTGRES_PASSWORD}" + steps: + - name: customers + options: + dbtable: "public.customers" + count: + records: 1000 +``` + +**Source Files**: +- `api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala` (supporting) + +**Tags**: `database`, `jdbc`, `relational`, `sql` + +--- + +### MySQL Connector + +**ID**: `connector.databases.mysql` +**Status**: Stable + +Connect to MySQL databases for reading and writing data. Supports table-level configuration and JDBC options. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `url` | string | Yes | `-` | JDBC connection URL | +| `user` | string | No | `-` | Database username | +| `password` | string | No | `-` | Database password | +| `driver` | string | No | `com.mysql.cj.jdbc.Driver` | JDBC driver class | +| `dbtable` | string | No | `-` | Target table | + +**Examples**: + +**MySQL connection**: +```yaml +connection: + type: mysql + options: + url: "jdbc:mysql://localhost:3306/mydb" + user: "root" + password: "${MYSQL_PASSWORD}" +``` + +**Source Files**: +- `api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala` (supporting) + +**Tags**: `database`, `jdbc`, `relational`, `sql` + +--- + +### Cassandra Connector + +**ID**: `connector.databases.cassandra` +**Status**: Stable + +Connect to Apache Cassandra for reading and writing data. Supports keyspace/table configuration, primary key and clustering positions. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `url` | string | Yes | `-` | Cassandra contact point URL | +| `user` | string | No | `-` | Cassandra username | +| `password` | string | No | `-` | Cassandra password | +| `keyspace` | string | Yes | `-` | Cassandra keyspace | +| `table` | string | Yes | `-` | Cassandra table | + +**Examples**: + +**Cassandra connection**: +```yaml +connection: + type: cassandra + options: + url: "localhost:9042" + user: "cassandra" + password: "cassandra" +``` + +**Source Files**: +- `api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala` (supporting) + +**Tags**: `database`, `nosql`, `wide-column` + +--- + +### BigQuery Connector + +**ID**: `connector.databases.bigquery` +**Status**: Stable + +Connect to Google BigQuery for reading and writing data. Supports direct and indirect write methods. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `table` | string | Yes | `-` | BigQuery table (project.dataset.table) | +| `credentialsFile` | string | No | `-` | Path to GCP credentials JSON | +| `writeMethod` | string | No | `indirect` | Write method Values: `direct`, `indirect` | +| `temporaryGcsBucket` | string | No | `-` | GCS bucket for indirect writes | +| `queryJobPriority` | string | No | `batch` | Query job priority | + +**Source Files**: +- `api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala` (supporting) + +**Tags**: `database`, `cloud`, `google`, `data-warehouse` + +--- + +## File Formats + +### CSV File Connector + +**ID**: `connector.files.csv` +**Status**: Stable + +Read and write CSV files. Supports headers, delimiters, and other CSV-specific options. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `path` | string | Yes | `-` | File system path for CSV files YAML: `dataSources[].connection.options.path` | + +**Examples**: + +**CSV file output**: +```yaml +connection: + type: csv + options: + path: "/tmp/data/csv-output" +``` + +**Source Files**: +- `api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala` (supporting) + +**Tags**: `file`, `csv`, `delimited`, `text` + +--- + +### JSON File Connector + +**ID**: `connector.files.json` +**Status**: Stable + +Read and write JSON files. Supports nested structures, arrays, and unwrapping top-level arrays. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `path` | string | Yes | `-` | File system path for JSON files | +| `unwrapTopLevelArray` | boolean | No | `false` | Output JSON as root-level array instead of object | + +**Examples**: + +**JSON file output**: +```yaml +connection: + type: json + options: + path: "/tmp/data/json-output" +``` + +**Source Files**: +- `api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala` (supporting) + +**Tags**: `file`, `json`, `structured` + +--- + +### Parquet File Connector + +**ID**: `connector.files.parquet` +**Status**: Stable + +Read and write Apache Parquet columnar files. Efficient for large datasets. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `path` | string | Yes | `-` | File system path for Parquet files | + +**Source Files**: +- `api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala` (supporting) + +**Tags**: `file`, `parquet`, `columnar`, `binary` + +--- + +### ORC File Connector + +**ID**: `connector.files.orc` +**Status**: Stable + +Read and write Apache ORC columnar files. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `path` | string | Yes | `-` | File system path for ORC files | + +**Source Files**: +- `api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala` (supporting) + +**Tags**: `file`, `orc`, `columnar`, `binary` + +--- + +### Delta Lake Connector + +**ID**: `connector.files.delta` +**Status**: Stable + +Read and write Delta Lake tables. Supports ACID transactions, time travel, and schema evolution. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `path` | string | Yes | `-` | File system path for Delta tables | + +**Source Files**: +- `api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala` (supporting) + +**Tags**: `file`, `delta`, `lakehouse`, `acid` + +--- + +### Apache Iceberg Connector + +**ID**: `connector.files.iceberg` +**Status**: Stable + +Read and write Apache Iceberg tables. Supports multiple catalog types (Hadoop, Hive, REST, Glue, JDBC, Nessie). + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `path` | string | Yes | `-` | Table path | +| `catalogType` | string | No | `hadoop` | Iceberg catalog type Values: `hadoop`, `hive`, `rest`, `glue`, `jdbc`, `nessie` | +| `catalogUri` | string | No | `-` | Catalog URI (for hive/rest/nessie) | +| `catalogDefaultNamespace` | string | No | `-` | Default namespace | + +**Source Files**: +- `api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala` (supporting) + +**Tags**: `file`, `iceberg`, `lakehouse`, `catalog` + +--- + +### Apache Hudi Connector + +**ID**: `connector.files.hudi` +**Status**: Stable + +Read and write Apache Hudi tables. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `path` | string | Yes | `-` | Table path | +| `hoodie.table.name` | string | Yes | `-` | Hudi table name | + +**Source Files**: +- `api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala` (supporting) + +**Tags**: `file`, `hudi`, `lakehouse` + +--- + +### XML File Connector + +**ID**: `connector.files.xml` +**Status**: Stable + +Read and write XML files. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `path` | string | Yes | `-` | File system path for XML files | + +**Source Files**: +- `api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala` (supporting) + +**Tags**: `file`, `xml`, `structured` + +--- + +## Messaging Systems + +### Apache Kafka Connector + +**ID**: `connector.messaging.kafka` +**Status**: Stable + +Connect to Apache Kafka for producing and consuming messages. Supports topics, partitions, headers, key/value serialization, and streaming patterns. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `url` | string | Yes | `-` | Kafka bootstrap servers YAML: `dataSources[].connection.options.url` | +| `topic` | string | Yes | `-` | Kafka topic name | +| `schemaLocation` | string | No | `-` | Schema registry URL or file path | + +**Examples**: + +**Kafka streaming**: +```yaml +dataSources: + - name: my_kafka + connection: + type: kafka + options: + url: "localhost:9092" + steps: + - name: orders_topic + options: + topic: "orders" + count: + duration: "1m" + rate: 100 + rateUnit: "second" +``` + +**Source Files**: +- `api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala` (supporting) + +**Tags**: `messaging`, `kafka`, `streaming`, `event` + +--- + +### Solace JMS Connector + +**ID**: `connector.messaging.solace` +**Status**: Stable + +Connect to Solace PubSub+ message broker via JMS. Supports queues and topics. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `url` | string | Yes | `-` | Solace broker URL | +| `user` | string | No | `-` | Username | +| `password` | string | No | `-` | Password | +| `vpnName` | string | No | `default` | VPN name | +| `connectionFactory` | string | No | `/jms/cf/default` | JNDI connection factory | +| `initialContextFactory` | string | No | `-` | JNDI context factory | +| `destinationName` | string | Yes | `-` | Queue/topic destination | + +**Source Files**: +- `api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala` (supporting) + +**Tags**: `messaging`, `jms`, `solace` + +--- + +### RabbitMQ Connector + +**ID**: `connector.messaging.rabbitmq` +**Status**: Stable + +Connect to RabbitMQ message broker via JMS. Supports queues. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `url` | string | Yes | `-` | RabbitMQ URL | +| `user` | string | No | `guest` | Username | +| `password` | string | No | `guest` | Password | +| `virtualHost` | string | No | `/` | Virtual host | +| `destinationName` | string | Yes | `-` | Queue name | + +**Source Files**: +- `api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala` (supporting) + +**Tags**: `messaging`, `rabbitmq`, `jms`, `amqp` + +--- + +## HTTP/REST + +### HTTP/REST API Connector + +**ID**: `connector.http.http` +**Status**: Stable + +Send generated data to HTTP/REST APIs. Supports custom methods, headers, URL path parameters, query parameters, and request bodies. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `url` | string | Yes | `-` | Base URL for HTTP requests | + +**Examples**: + +**HTTP API data generation**: +```yaml +dataSources: + - name: my_api + connection: + type: http + options: + url: "http://localhost:8080" + steps: + - name: create_users + fields: + - name: httpUrl + type: struct + fields: + - name: url + static: "http://localhost:8080/api/users" + - name: method + static: "POST" + - name: httpBody + type: struct + fields: + - name: name + options: + expression: "#{Name.fullName}" +``` + +**Source Files**: +- `api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala` (supporting) + +**Tags**: `http`, `rest`, `api`, `web` + +--- diff --git a/misc/feature-catalog/docs/categories/generation.md b/misc/feature-catalog/docs/categories/generation.md new file mode 100644 index 00000000..32debee4 --- /dev/null +++ b/misc/feature-catalog/docs/categories/generation.md @@ -0,0 +1,1191 @@ +# Data Generation + +Comprehensive data generation capabilities including regex patterns, faker expressions, SQL computations, and field-level configuration options. + +**55 features** in this category. + +## Table of Contents + +- [Generator Types](#generator-types) (12 features) +- [Data Types](#data-types) (13 features) +- [Field Options](#field-options) (17 features) +- [Field Labels (Auto-Detection)](#field-labels-auto-detection) (13 features) + +## Generator Types + +### Regex Pattern Generation + +**ID**: `generation.field.regex` +**Status**: Stable + +Generate string values matching a regular expression pattern. Supports SQL-based optimization for common patterns with automatic fallback to UDF for complex patterns (lookaheads, backreferences). 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `regex` | string | Yes | `-` | Regular expression pattern to generate values from YAML: `fields[].options.regex` | + +**Examples**: + +**Simple regex pattern**: +```yaml +- name: account_id + options: + regex: "ACC[0-9]{8}" +``` + +**Alphanumeric pattern**: +```yaml +- name: product_code + options: + regex: "[A-Z]{3}-[0-9]{4}" +``` + +**Scala API**: +```scala +field.name("account_id").regex("ACC[0-9]{8}") +``` + +**Source Files**: +- `app/src/main/scala/io/github/datacatering/datacaterer/core/generator/provider/regex/RegexPatternParser.scala` (primary) + +**Related Features**: +- `configuration.flags.enable_fast_generation` + +**Tags**: `generation`, `string`, `pattern`, `regex` + +**Performance Notes**: +- SQL-based optimization available via enableFastGeneration flag +- Complex patterns (lookaheads, backreferences) automatically fall back to UDF + +--- + +### DataFaker Expression + +**ID**: `generation.field.expression` +**Status**: Stable + +Generate realistic fake data using DataFaker library expressions. Supports names, addresses, emails, phone numbers, and hundreds of other data types. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `expression` | string | Yes | `-` | DataFaker expression (e.g., #{Name.firstName}) YAML: `fields[].options.expression` | + +**Examples**: + +**Full name generation**: +```yaml +- name: full_name + options: + expression: "#{Name.fullName}" +``` + +**Email generation**: +```yaml +- name: email + options: + expression: "#{Internet.emailAddress}" +``` + +**Scala API**: +```scala +field.name("email").expression("#{Internet.emailAddress}") +``` + +**Tags**: `generation`, `faker`, `realistic`, `expression` + +--- + +### One-Of Selection + +**ID**: `generation.field.one_of` +**Status**: Stable + +Generate values by randomly selecting from a predefined list of options. Useful for categorical data like statuses, types, and enums. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `oneOf` | array | Yes | `-` | List of values to randomly select from YAML: `fields[].options.oneOf` | + +**Examples**: + +**Enum field**: +```yaml +- name: status + options: + oneOf: ["active", "inactive", "pending"] +``` + +**Scala API**: +```scala +field.name("status").oneOf("active", "inactive", "pending") +``` + +**Tags**: `generation`, `enum`, `categorical`, `selection` + +--- + +### SQL Expression + +**ID**: `generation.field.sql` +**Status**: Stable + +Generate field values using Spark SQL expressions. Supports referencing other fields, date functions, string operations, aggregations, and conditional logic. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `sql` | string | Yes | `-` | Spark SQL expression for computed value YAML: `fields[].options.sql` | + +**Examples**: + +**Extract year from date field**: +```yaml +- name: year + type: integer + options: + sql: "YEAR(created_at)" +``` + +**Concatenate fields**: +```yaml +- name: full_name + type: string + options: + sql: "CONCAT(first_name, ' ', last_name)" +``` + +**Computed field**: +```yaml +- name: total_amount + type: double + options: + sql: "quantity * unit_price" +``` + +**Tags**: `generation`, `sql`, `computed`, `derived` + +--- + +### Static Value + +**ID**: `generation.field.static` +**Status**: Stable + +Set a fixed static value for all generated records. Useful for constant fields like API endpoints, methods, or content types. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `static` | string | Yes | `-` | Fixed value for all records YAML: `fields[].static` | + +**Examples**: + +**Static HTTP method**: +```yaml +- name: method + static: "POST" +``` + +**Tags**: `generation`, `static`, `constant` + +--- + +### UUID Generation + +**ID**: `generation.field.uuid` +**Status**: Stable + +Generate universally unique identifiers (UUID v4). + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `uuidPattern` | boolean | No | `false` | Enable UUID generation YAML: `fields[].options.uuidPattern` | + +**Examples**: + +**UUID field**: +```yaml +- name: id + options: + uuidPattern: true +``` + +**Tags**: `generation`, `uuid`, `identifier`, `unique` + +--- + +### Sequential Value Generation + +**ID**: `generation.field.sequence` +**Status**: Stable + +Generate sequential values with optional prefix and padding. Useful for IDs, batch numbers, and sequential identifiers. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `sequence` | object | Yes | `-` | Sequential value configuration with prefix and padding YAML: `fields[].options.sequence` | + +**Examples**: + +**Sequential order IDs**: +```yaml +- name: order_id + options: + sequence: + start: 1000 + step: 1 + prefix: "ORD-" + padding: 8 +``` + +**Tags**: `generation`, `sequence`, `sequential`, `incremental` + +--- + +### Conditional Value Generation + +**ID**: `generation.field.conditional_value` +**Status**: Stable + +Generate values using CASE WHEN logic based on other field values. Enables dependent field generation. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `conditionalValue` | object | Yes | `-` | CASE WHEN conditions and result values YAML: `fields[].options.conditionalValue` | + +**Examples**: + +**Conditional discount**: +```yaml +- name: discount + type: double + options: + conditionalValue: + conditions: + - expr: "customer_type = 'premium'" + value: 0.2 + - expr: "customer_type = 'standard'" + value: 0.1 + default: 0.0 +``` + +**Tags**: `generation`, `conditional`, `logic`, `derived` + +--- + +### Correlated Field Generation + +**ID**: `generation.field.correlated` +**Status**: Stable + +Generate values that are correlated (or negatively correlated) with another field. Useful for creating realistic relationships between numeric fields. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `correlatedWith` | string | No | `-` | Field name to correlate with YAML: `fields[].options.correlatedWith` | +| `negativelyCorrelatedWith` | string | No | `-` | Field name to negatively correlate with YAML: `fields[].options.negativelyCorrelatedWith` | + +**Examples**: + +**Positively correlated fields**: +```yaml +- name: revenue + type: double + options: + correlatedWith: "customer_count" +``` + +**Tags**: `generation`, `correlation`, `statistical`, `relationship` + +--- + +### Value Mapping + +**ID**: `generation.field.mapping` +**Status**: Stable + +Map values from one field to generate deterministic output in another field. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `mapping` | object | Yes | `-` | Mapping configuration from source field to output values YAML: `fields[].options.mapping` | + +**Examples**: + +**Country code mapping**: +```yaml +- name: country_code + options: + mapping: + sourceField: "country" + mappings: + "United States": "US" + "United Kingdom": "UK" +``` + +**Tags**: `generation`, `mapping`, `lookup`, `derived` + +--- + +### Semantic Version Generation + +**ID**: `generation.field.semantic_version` +**Status**: Stable + +Generate semantic version strings (e.g., 1.2.3). + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `semanticVersion` | object | No | `-` | Semantic version configuration YAML: `fields[].options.semanticVersion` | + +**Tags**: `generation`, `version`, `semver` + +--- + +### Daily Batch Sequence + +**ID**: `generation.field.daily_batch_sequence` +**Status**: Stable + +Generate daily batch sequence identifiers. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `dailyBatchSequence` | object | No | `-` | Daily batch sequence configuration YAML: `fields[].options.dailyBatchSequence` | + +**Tags**: `generation`, `batch`, `daily`, `sequence` + +--- + +## Data Types + +### String Type + +**ID**: `generation.type.string` +**Status**: Stable + +Text data type. Default field type. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set field type to "string" YAML: `fields[].type` | + +**Tags**: `generation`, `type`, `string` + +--- + +### Integer Type + +**ID**: `generation.type.integer` +**Status**: Stable + +32-bit integer values. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set field type to "integer" YAML: `fields[].type` | + +**Tags**: `generation`, `type`, `integer` + +--- + +### Long Type + +**ID**: `generation.type.long` +**Status**: Stable + +64-bit long integer values. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set field type to "long" YAML: `fields[].type` | + +**Tags**: `generation`, `type`, `long` + +--- + +### Double Type + +**ID**: `generation.type.double` +**Status**: Stable + +Double-precision floating point values. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set field type to "double" YAML: `fields[].type` | + +**Tags**: `generation`, `type`, `double` + +--- + +### Float Type + +**ID**: `generation.type.float` +**Status**: Stable + +Single-precision floating point values. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set field type to "float" YAML: `fields[].type` | + +**Tags**: `generation`, `type`, `float` + +--- + +### Decimal Type + +**ID**: `generation.type.decimal` +**Status**: Stable + +Fixed-precision decimal values with configurable precision and scale. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set field type to "decimal" YAML: `fields[].type` | + +**Tags**: `generation`, `type`, `decimal` + +--- + +### Boolean Type + +**ID**: `generation.type.boolean` +**Status**: Stable + +True/false boolean values. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set field type to "boolean" YAML: `fields[].type` | + +**Tags**: `generation`, `type`, `boolean` + +--- + +### Date Type + +**ID**: `generation.type.date` +**Status**: Stable + +Date values (year-month-day). + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set field type to "date" YAML: `fields[].type` | + +**Tags**: `generation`, `type`, `date` + +--- + +### Timestamp Type + +**ID**: `generation.type.timestamp` +**Status**: Stable + +Timestamp values with date and time. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set field type to "timestamp" YAML: `fields[].type` | + +**Tags**: `generation`, `type`, `timestamp` + +--- + +### Binary Type + +**ID**: `generation.type.binary` +**Status**: Stable + +Binary byte array values. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set field type to "binary" YAML: `fields[].type` | + +**Tags**: `generation`, `type`, `binary` + +--- + +### Array Type + +**ID**: `generation.type.array` +**Status**: Stable + +Array/list of elements. Configurable element type, min/max length. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set field type to "array" YAML: `fields[].type` | + +**Tags**: `generation`, `type`, `array` + +--- + +### Struct Type + +**ID**: `generation.type.struct` +**Status**: Stable + +Nested structure with named fields. Supports deep nesting. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set field type to "struct" YAML: `fields[].type` | + +**Tags**: `generation`, `type`, `struct` + +--- + +### Map Type + +**ID**: `generation.type.map` +**Status**: Stable + +Key-value map type with configurable key and value types. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set field type to "map" YAML: `fields[].type` | + +**Tags**: `generation`, `type`, `map` + +--- + +## Field Options + +### Numeric Range + +**ID**: `generation.option.numeric_range` +**Status**: Stable + +Constrain numeric fields (integer, long, double, float, decimal) to a minimum and maximum range. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `min` | any | No | `-` | Minimum value (inclusive) YAML: `fields[].options.min` | +| `max` | any | No | `-` | Maximum value (inclusive) YAML: `fields[].options.max` | + +**Examples**: + +**Integer range**: +```yaml +- name: age + type: integer + options: + min: 18 + max: 120 +``` + +**Double range**: +```yaml +- name: price + type: double + options: + min: 9.99 + max: 999.99 +``` + +**Tags**: `generation`, `numeric`, `range`, `constraint` + +--- + +### Date/Time Range + +**ID**: `generation.option.date_range` +**Status**: Stable + +Constrain date and timestamp fields to a minimum and maximum range. Also supports excluding weekends, restricting values to business hours, and generating dates within the last or next N days. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `min` | string | No | `-` | Minimum date/timestamp YAML: `fields[].options.min` | +| `max` | string | No | `-` | Maximum date/timestamp YAML: `fields[].options.max` | +| `excludeWeekends` | boolean | No | `false` | Exclude Saturday and Sunday YAML: `fields[].options.excludeWeekends` | +| `withinDays` | integer | No | `-` | Generate dates within last N days from now YAML: `fields[].options.withinDays` | +| `futureDays` | integer | No | `-` | Generate dates within next N days from now YAML: `fields[].options.futureDays` | +| `businessHours` | boolean | No | `false` | Restrict to business hours YAML: `fields[].options.businessHours` | +| `timeBetween` | object | No | `-` | Generate times between start and end YAML: `fields[].options.timeBetween` | + +**Examples**: + +**Timestamp range**: +```yaml +- name: created_at + type: timestamp + options: + min: "2024-01-01T00:00:00" + max: "2024-12-31T23:59:59" +``` + +**Tags**: `generation`, `date`, `timestamp`, `range` + +--- + +### Null Value Control + +**ID**: `generation.option.null_handling` 
+**Status**: Stable + +Control whether and how often null values appear in generated data. Configurable null probability per field. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `enableNull` | boolean | No | `false` | Allow null values for this field YAML: `fields[].options.enableNull` | +| `nullProb` | double | No | `-` | Probability of generating a null value (0-1) YAML: `fields[].options.nullProb` | +| `nullable` | boolean | No | `true` | Whether the field schema allows nulls YAML: `fields[].nullable` | + +**Examples**: + +**30% null probability**: +```yaml +- name: middle_name + options: + enableNull: true + nullProb: 0.3 +``` + +**Tags**: `generation`, `null`, `nullable`, `probability` + +--- + +### Edge Case Generation + +**ID**: `generation.option.edge_cases` +**Status**: Stable + +Control the probability of generating edge case values (empty strings, boundary values, special characters). + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `enableEdgeCase` | boolean | No | `false` | Enable edge case generation | +| `edgeCaseProb` | double | No | `-` | Probability of generating edge case values (0-1) | + +**Tags**: `generation`, `edge-case`, `boundary`, `testing` + +--- + +### String Length Control + +**ID**: `generation.option.string_length` +**Status**: Stable + +Control the length of generated string values with minimum, maximum, and average length. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `minLen` | integer | No | `-` | Minimum string length | +| `maxLen` | integer | No | `-` | Maximum string length | +| `avgLen` | integer | No | `-` | Average string length | + +**Tags**: `generation`, `string`, `length`, `constraint` + +--- + +### Array Configuration + +**ID**: `generation.option.array_config` +**Status**: Stable + +Configure array field generation: element count, element type, uniqueness, empty probability, and weighted selection. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `arrayMinLen` | integer | No | `-` | Minimum array length YAML: `fields[].options.arrayMinLength` | +| `arrayMaxLen` | integer | No | `-` | Maximum array length YAML: `fields[].options.arrayMaxLength` | +| `arrayFixedSize` | integer | No | `-` | Fixed array size | +| `arrayEmptyProb` | double | No | `-` | Probability of empty array (0-1) YAML: `fields[].options.arrayEmptyProbability` | +| `arrayType` | string | No | `-` | Element data type for array | +| `arrayOneOf` | string | No | `-` | Comma-separated values for array elements | +| `arrayUniqueFrom` | string | No | `-` | Source for unique array elements | +| `arrayWeightedOneOf` | string | No | `-` | Weighted selection for elements (e.g., HIGH:0.2,MEDIUM:0.5,LOW:0.3) YAML: `fields[].options.arrayWeightedOneOf` | + +**Tags**: `generation`, `array`, `collection`, `nested` + +--- + +### Map Configuration + +**ID**: `generation.option.map_config` +**Status**: Stable + +Configure map field generation with minimum and maximum size. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `mapMinSize` | integer | No | `-` | Minimum number of entries | +| `mapMaxSize` | integer | No | `-` | Maximum number of entries | + +**Tags**: `generation`, `map`, `key-value`, `nested` + +--- + +### Value Distribution + +**ID**: `generation.option.distribution` +**Status**: Stable + +Control the statistical distribution of generated numeric values. Supports uniform, normal, and exponential distributions. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `distribution` | enum | No | `-` | Distribution type Values: `uniform`, `normal`, `exponential` | +| `mean` | double | No | `-` | Mean for normal distribution | +| `stddev` | double | No | `-` | Standard deviation for normal distribution | +| `distributionRateParam` | double | No | `-` | Rate parameter for exponential distribution | + +**Tags**: `generation`, `distribution`, `statistical`, `normal`, `uniform` + +--- + +### Uniqueness Constraint + +**ID**: `generation.option.uniqueness` +**Status**: Stable + +Enforce unique values for a field using bloom filter-based deduplication. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `isUnique` | boolean | No | `false` | Enforce unique values YAML: `fields[].options.isUnique` | +| `isPrimaryKey` | boolean | No | `false` | Mark as primary key (implies unique) YAML: `fields[].options.isPrimaryKey` | +| `primaryKeyPos` | integer | No | `-` | Position in composite primary key | + +**Related Features**: +- `configuration.flags.enable_unique_check` + +**Tags**: `generation`, `unique`, `primary-key`, `constraint` + +--- + +### Numeric Precision and Scale + +**ID**: `generation.option.numeric_precision` +**Status**: Stable + +Control precision and scale for decimal fields, and rounding for numeric fields. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `precision` | integer | No | `-` | Numeric precision (total digits) | +| `scale` | integer | No | `-` | Numeric scale (decimal places) | +| `round` | integer | No | `-` | Round numeric values to N decimal places | + +**Tags**: `generation`, `numeric`, `precision`, `decimal` + +--- + +### Field Omission + +**ID**: `generation.option.omit` +**Status**: Stable + +Generate a field for use in computed expressions but omit it from the final output. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `omit` | boolean | No | `false` | Omit field from output YAML: `fields[].options.omit` | + +**Tags**: `generation`, `omit`, `helper`, `computed` + +--- + +### Random Seed + +**ID**: `generation.option.seed` +**Status**: Stable + +Set a random seed for reproducible data generation per field. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `seed` | integer | No | `-` | Random seed for reproducible generation YAML: `fields[].options.seed` | + +**Tags**: `generation`, `seed`, `reproducible`, `deterministic` + +--- + +### Distinct Value Count + +**ID**: `generation.option.distinct_count` +**Status**: Stable + +Control how many distinct values are generated for a field. Used with metadata-driven generation. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `distinctCount` | integer | No | `-` | Number of distinct values to generate | +| `histogram` | object | No | `-` | Value distribution histogram | + +**Tags**: `generation`, `distinct`, `cardinality`, `metadata` + +--- + +### Cassandra Key Configuration + +**ID**: `generation.option.cassandra_keys` +**Status**: Stable + +Configure Cassandra-specific primary key and clustering positions for fields. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `isPrimaryKey` | boolean | No | `-` | Mark as partition key | +| `primaryKeyPos` | integer | No | `-` | Position in composite partition key | +| `clusteringPos` | integer | No | `-` | Clustering column position | + +**Tags**: `generation`, `cassandra`, `primary-key`, `clustering` + +--- + +### Incremental Generation + +**ID**: `generation.option.incremental` +**Status**: Stable + +Mark a field for incremental generation, tracking the last generated value across runs. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `incremental` | boolean | No | `false` | Enable incremental mode | + +**Tags**: `generation`, `incremental`, `tracking` + +--- + +### HTTP Parameter Type + +**ID**: `generation.option.http_param_type` +**Status**: Stable + +Specify the HTTP parameter type for a field when using the HTTP connector (path, query, or header). + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `httpParamType` | enum | No | `-` | HTTP parameter placement Values: `path`, `query`, `header` | + +**Tags**: `generation`, `http`, `parameter`, `api` + +--- + +### Post-SQL Expression + +**ID**: `generation.option.post_sql_expression` +**Status**: Stable + +Apply a SQL expression to transform the field value after initial generation. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `postSqlExpression` | string | No | `-` | SQL expression to apply after generation | + +**Tags**: `generation`, `sql`, `transform`, `post-processing` + +--- + +## Field Labels (Auto-Detection) + +### Name Label + +**ID**: `generation.label.name` +**Status**: Stable + +Generate person name fields (first name, last name, full name). Used for metadata-driven field generation to automatically select appropriate data generators. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `label` | string | No | `-` | Set field label to "name" for auto-detection | + +**Tags**: `generation`, `label`, `metadata`, `name` + +--- + +### Username Label + +**ID**: `generation.label.username` +**Status**: Stable + +Generate username fields. Used for metadata-driven field generation to automatically select appropriate data generators. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `label` | string | No | `-` | Set field label to "username" for auto-detection | + +**Tags**: `generation`, `label`, `metadata`, `username` + +--- + +### Address Label + +**ID**: `generation.label.address` +**Status**: Stable + +Generate address fields (street, city, postcode). Used for metadata-driven field generation to automatically select appropriate data generators. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `label` | string | No | `-` | Set field label to "address" for auto-detection | + +**Tags**: `generation`, `label`, `metadata`, `address` + +--- + +### Application Label + +**ID**: `generation.label.app` +**Status**: Stable + +Generate application-related fields (version). Used for metadata-driven field generation to automatically select appropriate data generators. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `label` | string | No | `-` | Set field label to "app" for auto-detection | + +**Tags**: `generation`, `label`, `metadata`, `app` + +--- + +### Nation Label + +**ID**: `generation.label.nation` +**Status**: Stable + +Generate nationality, language, capital city. Used for metadata-driven field generation to automatically select appropriate data generators. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `label` | string | No | `-` | Set field label to "nation" for auto-detection | + +**Tags**: `generation`, `label`, `metadata`, `nation` + +--- + +### Money Label + +**ID**: `generation.label.money` +**Status**: Stable + +Generate currency and financial fields. Used for metadata-driven field generation to automatically select appropriate data generators. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `label` | string | No | `-` | Set field label to "money" for auto-detection | + +**Tags**: `generation`, `label`, `metadata`, `money` + +--- + +### Internet Label + +**ID**: `generation.label.internet` +**Status**: Stable + +Generate email, IP, MAC address fields. Used for metadata-driven field generation to automatically select appropriate data generators. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `label` | string | No | `-` | Set field label to "internet" for auto-detection | + +**Tags**: `generation`, `label`, `metadata`, `internet` + +--- + +### Food Label + +**ID**: `generation.label.food` +**Status**: Stable + +Generate food and ingredient fields. Used for metadata-driven field generation to automatically select appropriate data generators. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `label` | string | No | `-` | Set field label to "food" for auto-detection | + +**Tags**: `generation`, `label`, `metadata`, `food` + +--- + +### Job Label + +**ID**: `generation.label.job` +**Status**: Stable + +Generate job title, field, position. Used for metadata-driven field generation to automatically select appropriate data generators. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `label` | string | No | `-` | Set field label to "job" for auto-detection | + +**Tags**: `generation`, `label`, `metadata`, `job` + +--- + +### Relationship Label + +**ID**: `generation.label.relationship` +**Status**: Stable + +Generate relationship type fields. Used for metadata-driven field generation to automatically select appropriate data generators. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `label` | string | No | `-` | Set field label to "relationship" for auto-detection | + +**Tags**: `generation`, `label`, `metadata`, `relationship` + +--- + +### Weather Label + +**ID**: `generation.label.weather` +**Status**: Stable + +Generate weather description fields. Used for metadata-driven field generation to automatically select appropriate data generators. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `label` | string | No | `-` | Set field label to "weather" for auto-detection | + +**Tags**: `generation`, `label`, `metadata`, `weather` + +--- + +### Phone Label + +**ID**: `generation.label.phone` +**Status**: Stable + +Generate phone number fields. Used for metadata-driven field generation to automatically select appropriate data generators. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `label` | string | No | `-` | Set field label to "phone" for auto-detection | + +**Tags**: `generation`, `label`, `metadata`, `phone` + +--- + +### Geo Label + +**ID**: `generation.label.geo` +**Status**: Stable + +Generate geographic coordinate fields. Used for metadata-driven field generation to automatically select appropriate data generators. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `label` | string | No | `-` | Set field label to "geo" for auto-detection | + +**Tags**: `generation`, `label`, `metadata`, `geo` + +--- diff --git a/misc/feature-catalog/docs/categories/metadata.md b/misc/feature-catalog/docs/categories/metadata.md new file mode 100644 index 00000000..a08a1e82 --- /dev/null +++ b/misc/feature-catalog/docs/categories/metadata.md @@ -0,0 +1,121 @@ +# Metadata Integration + +Import schemas and metadata from external catalogs, registries, and standards to drive automatic data generation. + +**10 features** in this category. + +## Table of Contents + +- [Metadata Sources](#metadata-sources) (10 features) + +## Metadata Sources + +### Marquez Integration + +**ID**: `metadata.source.marquez` +**Status**: Stable + +Marquez open-source metadata service with OpenLineage support. + +**Tags**: `metadata`, `integration`, `marquez` + +--- + +### OpenMetadata Integration + +**ID**: `metadata.source.open_metadata` +**Status**: Stable + +OpenMetadata platform for metadata discovery. Supports multiple auth types (basic, Azure, Google, Okta, Auth0, AWS Cognito). + +**Tags**: `metadata`, `integration`, `open-metadata` + +--- + +### OpenAPI/Swagger Integration + +**ID**: `metadata.source.open_api` +**Status**: Stable + +Generate data from OpenAPI/Swagger specifications. + +**Tags**: `metadata`, `integration`, `open-api` + +--- + +### Great Expectations Integration + +**ID**: `metadata.source.great_expectations` +**Status**: Stable + +Import data quality expectations from Great Expectations suites. + +**Tags**: `metadata`, `integration`, `great-expectations` + +--- + +### Open Data Contract Standard Integration + +**ID**: `metadata.source.open_data_contract_standard` +**Status**: Stable + +Import schemas from ODCS format.
+ +**Tags**: `metadata`, `integration`, `open-data-contract-standard` + +--- + +### Data Contract CLI Integration + +**ID**: `metadata.source.data_contract_cli` +**Status**: Stable + +Import schemas from Data Contract CLI format. + +**Tags**: `metadata`, `integration`, `data-contract-cli` + +--- + +### Amundsen Integration + +**ID**: `metadata.source.amundsen` +**Status**: Stable + +Import metadata from Amundsen data catalog. + +**Tags**: `metadata`, `integration`, `amundsen` + +--- + +### DataHub Integration + +**ID**: `metadata.source.datahub` +**Status**: Stable + +Import metadata from DataHub data catalog. + +**Tags**: `metadata`, `integration`, `datahub` + +--- + +### Confluent Schema Registry Integration + +**ID**: `metadata.source.confluent_schema_registry` +**Status**: Stable + +Import schemas from Confluent Schema Registry (Avro, Protobuf, JSON Schema). + +**Tags**: `metadata`, `integration`, `confluent-schema-registry` + +--- + +### JSON Schema Integration + +**ID**: `metadata.source.json_schema` +**Status**: Stable + +Generate data from JSON Schema definitions. + +**Tags**: `metadata`, `integration`, `json-schema` + +--- diff --git a/misc/feature-catalog/docs/categories/ui-api.md b/misc/feature-catalog/docs/categories/ui-api.md new file mode 100644 index 00000000..0c3a8348 --- /dev/null +++ b/misc/feature-catalog/docs/categories/ui-api.md @@ -0,0 +1,95 @@ +# UI and API + +Web-based user interface for visual plan creation, execution management, and result viewing. + +**6 features** in this category. + +## Table of Contents + +- [Web UI Features](#web-ui-features) (6 features) + +## Web UI Features + +### Connection Management + +**ID**: `ui.connection_management` +**Status**: Stable + +Create, edit, test, and manage data source connections through the web UI.
+ +**Source Files**: +- `app/src/main/scala/io/github/datacatering/datacaterer/core/ui/` (primary) + +**Tags**: `ui`, `web`, `connection-management` + +--- + +### Interactive Plan Creation + +**ID**: `ui.plan_creation` +**Status**: Stable + +Build data generation plans interactively with visual field configuration. + +**Source Files**: +- `app/src/main/scala/io/github/datacatering/datacaterer/core/ui/` (primary) + +**Tags**: `ui`, `web`, `plan-creation` + +--- + +### Execution History + +**ID**: `ui.execution_history` +**Status**: Stable + +View past execution runs with status, timing, and record counts. + +**Source Files**: +- `app/src/main/scala/io/github/datacatering/datacaterer/core/ui/` (primary) + +**Tags**: `ui`, `web`, `execution-history` + +--- + +### Real-time Results + +**ID**: `ui.results_viewing` +**Status**: Stable + +View generation and validation results in real-time during execution. + +**Source Files**: +- `app/src/main/scala/io/github/datacatering/datacaterer/core/ui/` (primary) + +**Tags**: `ui`, `web`, `results-viewing` + +--- + +### Sample Data Generation + +**ID**: `ui.sample_data` +**Status**: Stable + +Preview generated sample data before running full generation. + +**Source Files**: +- `app/src/main/scala/io/github/datacatering/datacaterer/core/ui/` (primary) + +**Tags**: `ui`, `web`, `sample-data` + +--- + +### Report Generation + +**ID**: `ui.report_generation` +**Status**: Stable + +Generate detailed HTML reports with generation statistics and validation results. 
+ +**Source Files**: +- `app/src/main/scala/io/github/datacatering/datacaterer/core/ui/` (primary) + +**Tags**: `ui`, `web`, `report-generation` + +--- diff --git a/misc/feature-catalog/docs/categories/validation.md b/misc/feature-catalog/docs/categories/validation.md new file mode 100644 index 00000000..d0cfcb77 --- /dev/null +++ b/misc/feature-catalog/docs/categories/validation.md @@ -0,0 +1,900 @@ +# Data Validation + +Over 30 validation types for verifying generated data quality, schema compliance, statistical properties, and cross-source consistency. + +**42 features** in this category. + +## Table of Contents + +- [Field-Level Validations](#field-level-validations) (29 features) +- [Statistical Validations](#statistical-validations) (8 features) +- [Expression Validations](#expression-validations) (1 feature) +- [Aggregation Validations](#aggregation-validations) (1 feature) +- [Cross-Source Validations](#cross-source-validations) (1 feature) +- [Schema Validations](#schema-validations) (1 feature) +- [Wait Conditions](#wait-conditions) (1 feature) + +## Field-Level Validations + +### Null Check + +**ID**: `validation.field.null` +**Status**: Stable + +Validate that a field is null (or not null with negate=true). + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set validation type to "null" | +| `negate` | boolean | No | `false` | Invert the validation result | +| `errorThreshold` | double | No | `-` | Allowed error rate (0-1) | +| `description` | string | No | `-` | Human-readable description | + +**Tags**: `validation`, `field`, `null` + +--- + +### Unique Values + +**ID**: `validation.field.unique` +**Status**: Stable + +Validate that all values in a field are unique.
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set validation type to "unique" | +| `negate` | boolean | No | `false` | Invert the validation result | +| `errorThreshold` | double | No | `-` | Allowed error rate (0-1) | +| `description` | string | No | `-` | Human-readable description | + +**Tags**: `validation`, `field`, `unique` + +--- + +### Equality Check + +**ID**: `validation.field.equal` +**Status**: Stable + +Validate that field values equal a specified value. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set validation type to "equal" | +| `negate` | boolean | No | `false` | Invert the validation result | +| `errorThreshold` | double | No | `-` | Allowed error rate (0-1) | +| `description` | string | No | `-` | Human-readable description | + +**Tags**: `validation`, `field`, `equal` + +--- + +### Contains Check + +**ID**: `validation.field.contains` +**Status**: Stable + +Validate that string field values contain a substring. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set validation type to "contains" | +| `negate` | boolean | No | `false` | Invert the validation result | +| `errorThreshold` | double | No | `-` | Allowed error rate (0-1) | +| `description` | string | No | `-` | Human-readable description | + +**Tags**: `validation`, `field`, `contains` + +--- + +### Starts With + +**ID**: `validation.field.starts_with` +**Status**: Stable + +Validate that string field values start with a prefix. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set validation type to "startswith" | +| `negate` | boolean | No | `false` | Invert the validation result | +| `errorThreshold` | double | No | `-` | Allowed error rate (0-1) | +| `description` | string | No | `-` | Human-readable description | + +**Tags**: `validation`, `field`, `starts-with` + +--- + +### Ends With + +**ID**: `validation.field.ends_with` +**Status**: Stable + +Validate that string field values end with a suffix. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set validation type to "endswith" | +| `negate` | boolean | No | `false` | Invert the validation result | +| `errorThreshold` | double | No | `-` | Allowed error rate (0-1) | +| `description` | string | No | `-` | Human-readable description | + +**Tags**: `validation`, `field`, `ends-with` + +--- + +### Less Than + +**ID**: `validation.field.less_than` +**Status**: Stable + +Validate that numeric values are less than a threshold. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set validation type to "lessthan" | +| `negate` | boolean | No | `false` | Invert the validation result | +| `errorThreshold` | double | No | `-` | Allowed error rate (0-1) | +| `description` | string | No | `-` | Human-readable description | + +**Tags**: `validation`, `field`, `less-than` + +--- + +### Greater Than + +**ID**: `validation.field.greater_than` +**Status**: Stable + +Validate that numeric values are greater than a threshold. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set validation type to "greaterthan" | +| `negate` | boolean | No | `false` | Invert the validation result | +| `errorThreshold` | double | No | `-` | Allowed error rate (0-1) | +| `description` | string | No | `-` | Human-readable description | + +**Tags**: `validation`, `field`, `greater-than` + +--- + +### Between Range + +**ID**: `validation.field.between` +**Status**: Stable + +Validate that values fall within a min/max range (inclusive). + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set validation type to "between" | +| `negate` | boolean | No | `false` | Invert the validation result | +| `errorThreshold` | double | No | `-` | Allowed error rate (0-1) | +| `description` | string | No | `-` | Human-readable description | + +**Tags**: `validation`, `field`, `between` + +--- + +### In Set + +**ID**: `validation.field.in` +**Status**: Stable + +Validate that values exist in a specified set of allowed values. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set validation type to "in" | +| `negate` | boolean | No | `false` | Invert the validation result | +| `errorThreshold` | double | No | `-` | Allowed error rate (0-1) | +| `description` | string | No | `-` | Human-readable description | + +**Tags**: `validation`, `field`, `in` + +--- + +### Regex Match + +**ID**: `validation.field.matches` +**Status**: Stable + +Validate that string values match a regular expression pattern. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set validation type to "matches" | +| `negate` | boolean | No | `false` | Invert the validation result | +| `errorThreshold` | double | No | `-` | Allowed error rate (0-1) | +| `description` | string | No | `-` | Human-readable description | + +**Tags**: `validation`, `field`, `matches` + +--- + +### Regex Match List + +**ID**: `validation.field.matches_list` +**Status**: Stable + +Validate that string values match one of multiple regex patterns. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set validation type to "matcheslist" | +| `negate` | boolean | No | `false` | Invert the validation result | +| `errorThreshold` | double | No | `-` | Allowed error rate (0-1) | +| `description` | string | No | `-` | Human-readable description | + +**Tags**: `validation`, `field`, `matches-list` + +--- + +### Size Check + +**ID**: `validation.field.size` +**Status**: Stable + +Validate the size/length of a collection or string field. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set validation type to "size" | +| `negate` | boolean | No | `false` | Invert the validation result | +| `errorThreshold` | double | No | `-` | Allowed error rate (0-1) | +| `description` | string | No | `-` | Human-readable description | + +**Tags**: `validation`, `field`, `size` + +--- + +### Less Than Size + +**ID**: `validation.field.less_than_size` +**Status**: Stable + +Validate that collection size is less than a threshold. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set validation type to "lessthansize" | +| `negate` | boolean | No | `false` | Invert the validation result | +| `errorThreshold` | double | No | `-` | Allowed error rate (0-1) | +| `description` | string | No | `-` | Human-readable description | + +**Tags**: `validation`, `field`, `less-than-size` + +--- + +### Greater Than Size + +**ID**: `validation.field.greater_than_size` +**Status**: Stable + +Validate that collection size is greater than a threshold. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set validation type to "greaterthansize" | +| `negate` | boolean | No | `false` | Invert the validation result | +| `errorThreshold` | double | No | `-` | Allowed error rate (0-1) | +| `description` | string | No | `-` | Human-readable description | + +**Tags**: `validation`, `field`, `greater-than-size` + +--- + +### Length Between + +**ID**: `validation.field.length_between` +**Status**: Stable + +Validate that string length falls within a range. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set validation type to "lengthbetween" | +| `negate` | boolean | No | `false` | Invert the validation result | +| `errorThreshold` | double | No | `-` | Allowed error rate (0-1) | +| `description` | string | No | `-` | Human-readable description | + +**Tags**: `validation`, `field`, `length-between` + +--- + +### Length Equal + +**ID**: `validation.field.length_equal` +**Status**: Stable + +Validate that string length equals a specific value. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set validation type to "lengthequal" | +| `negate` | boolean | No | `false` | Invert the validation result | +| `errorThreshold` | double | No | `-` | Allowed error rate (0-1) | +| `description` | string | No | `-` | Human-readable description | + +**Tags**: `validation`, `field`, `length-equal` + +--- + +### Luhn Check + +**ID**: `validation.field.luhn_check` +**Status**: Stable + +Validate values using the Luhn algorithm (credit cards, IDs). + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set validation type to "luhncheck" | +| `negate` | boolean | No | `false` | Invert the validation result | +| `errorThreshold` | double | No | `-` | Allowed error rate (0-1) | +| `description` | string | No | `-` | Human-readable description | + +**Tags**: `validation`, `field`, `luhn-check` + +--- + +### Type Check + +**ID**: `validation.field.has_type` +**Status**: Stable + +Validate that field values are of a specific data type. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set validation type to "hastype" | +| `negate` | boolean | No | `false` | Invert the validation result | +| `errorThreshold` | double | No | `-` | Allowed error rate (0-1) | +| `description` | string | No | `-` | Human-readable description | + +**Tags**: `validation`, `field`, `has-type` + +--- + +### Multi-Type Check + +**ID**: `validation.field.has_types` +**Status**: Stable + +Validate that field values match one of multiple types. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set validation type to "hastypes" | +| `negate` | boolean | No | `false` | Invert the validation result | +| `errorThreshold` | double | No | `-` | Allowed error rate (0-1) | +| `description` | string | No | `-` | Human-readable description | + +**Tags**: `validation`, `field`, `has-types` + +--- + +### Monotonically Decreasing + +**ID**: `validation.field.is_decreasing` +**Status**: Stable + +Validate that values are in decreasing order. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set validation type to "isdecreasing" | +| `negate` | boolean | No | `false` | Invert the validation result | +| `errorThreshold` | double | No | `-` | Allowed error rate (0-1) | +| `description` | string | No | `-` | Human-readable description | + +**Tags**: `validation`, `field`, `is-decreasing` + +--- + +### Monotonically Increasing + +**ID**: `validation.field.is_increasing` +**Status**: Stable + +Validate that values are in increasing order. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set validation type to "isincreasing" | +| `negate` | boolean | No | `false` | Invert the validation result | +| `errorThreshold` | double | No | `-` | Allowed error rate (0-1) | +| `description` | string | No | `-` | Human-readable description | + +**Tags**: `validation`, `field`, `is-increasing` + +--- + +### JSON Parsable + +**ID**: `validation.field.is_json_parsable` +**Status**: Stable + +Validate that string values are valid JSON. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set validation type to "isjsonparsable" | +| `negate` | boolean | No | `false` | Invert the validation result | +| `errorThreshold` | double | No | `-` | Allowed error rate (0-1) | +| `description` | string | No | `-` | Human-readable description | + +**Tags**: `validation`, `field`, `is-json-parsable` + +--- + +### JSON Schema Match + +**ID**: `validation.field.match_json_schema` +**Status**: Stable + +Validate that JSON values conform to a JSON schema. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set validation type to "matchjsonschema" | +| `negate` | boolean | No | `false` | Invert the validation result | +| `errorThreshold` | double | No | `-` | Allowed error rate (0-1) | +| `description` | string | No | `-` | Human-readable description | + +**Tags**: `validation`, `field`, `match-json-schema` + +--- + +### DateTime Format Match + +**ID**: `validation.field.match_date_time_format` +**Status**: Stable + +Validate that values match a specific date/time format. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set validation type to "matchdatetimeformat" | +| `negate` | boolean | No | `false` | Invert the validation result | +| `errorThreshold` | double | No | `-` | Allowed error rate (0-1) | +| `description` | string | No | `-` | Human-readable description | + +**Tags**: `validation`, `field`, `match-date-time-format` + +--- + +### Distinct In Set + +**ID**: `validation.field.distinct_in_set` +**Status**: Stable + +Validate that all distinct values exist in a specified set. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set validation type to "distinctinset" | +| `negate` | boolean | No | `false` | Invert the validation result | +| `errorThreshold` | double | No | `-` | Allowed error rate (0-1) | +| `description` | string | No | `-` | Human-readable description | + +**Tags**: `validation`, `field`, `distinct-in-set` + +--- + +### Distinct Contains Set + +**ID**: `validation.field.distinct_contains_set` +**Status**: Stable + +Validate that distinct values contain all values from a specified set. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set validation type to "distinctcontainsset" | +| `negate` | boolean | No | `false` | Invert the validation result | +| `errorThreshold` | double | No | `-` | Allowed error rate (0-1) | +| `description` | string | No | `-` | Human-readable description | + +**Tags**: `validation`, `field`, `distinct-contains-set` + +--- + +### Distinct Equal + +**ID**: `validation.field.distinct_equal` +**Status**: Stable + +Validate that the set of distinct values exactly equals a specified set. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set validation type to "distinctequal" | +| `negate` | boolean | No | `false` | Invert the validation result | +| `errorThreshold` | double | No | `-` | Allowed error rate (0-1) | +| `description` | string | No | `-` | Human-readable description | + +**Tags**: `validation`, `field`, `distinct-equal` + +--- + +### Most Common Value + +**ID**: `validation.field.most_common_value_in_set` +**Status**: Stable + +Validate that the most common value is in a specified set. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set validation type to "mostcommonvalueinset" | +| `negate` | boolean | No | `false` | Invert the validation result | +| `errorThreshold` | double | No | `-` | Allowed error rate (0-1) | +| `description` | string | No | `-` | Human-readable description | + +**Tags**: `validation`, `field`, `most-common-value-in-set` + +--- + +## Statistical Validations + +### Max Between + +**ID**: `validation.statistical.max_between` +**Status**: Stable + +Validate that the maximum value of a field falls within a range. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set validation type | +| `min` | any | No | `-` | Minimum expected value | +| `max` | any | No | `-` | Maximum expected value | + +**Tags**: `validation`, `statistical`, `max-between` + +--- + +### Mean Between + +**ID**: `validation.statistical.mean_between` +**Status**: Stable + +Validate that the mean value of a field falls within a range. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set validation type | +| `min` | any | No | `-` | Minimum expected value | +| `max` | any | No | `-` | Maximum expected value | + +**Tags**: `validation`, `statistical`, `mean-between` + +--- + +### Median Between + +**ID**: `validation.statistical.median_between` +**Status**: Stable + +Validate that the median value of a field falls within a range. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set validation type | +| `min` | any | No | `-` | Minimum expected value | +| `max` | any | No | `-` | Maximum expected value | + +**Tags**: `validation`, `statistical`, `median-between` + +--- + +### Min Between + +**ID**: `validation.statistical.min_between` +**Status**: Stable + +Validate that the minimum value of a field falls within a range. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set validation type | +| `min` | any | No | `-` | Minimum expected value | +| `max` | any | No | `-` | Maximum expected value | + +**Tags**: `validation`, `statistical`, `min-between` + +--- + +### Std Dev Between + +**ID**: `validation.statistical.std_dev_between` +**Status**: Stable + +Validate that the standard deviation of a field falls within a range. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set validation type | +| `min` | any | No | `-` | Minimum expected value | +| `max` | any | No | `-` | Maximum expected value | + +**Tags**: `validation`, `statistical`, `std-dev-between` + +--- + +### Sum Between + +**ID**: `validation.statistical.sum_between` +**Status**: Stable + +Validate that the sum of a field falls within a range. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set validation type | +| `min` | any | No | `-` | Minimum expected value | +| `max` | any | No | `-` | Maximum expected value | + +**Tags**: `validation`, `statistical`, `sum-between` + +--- + +### Unique Values Proportion + +**ID**: `validation.statistical.unique_values_proportion_between` +**Status**: Stable + +Validate that the proportion of unique values falls within a range. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set validation type | +| `min` | any | No | `-` | Minimum expected value | +| `max` | any | No | `-` | Maximum expected value | + +**Tags**: `validation`, `statistical`, `unique-values-proportion-between` + +--- + +### Quantile Values Between + +**ID**: `validation.statistical.quantile_values_between` +**Status**: Stable + +Validate that quantile values fall within specified ranges. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | string | No | `-` | Set validation type | +| `min` | any | No | `-` | Minimum expected value | +| `max` | any | No | `-` | Maximum expected value | + +**Tags**: `validation`, `statistical`, `quantile-values-between` + +--- + +## Expression Validations + +### SQL Expression Validation + +**ID**: `validation.expression` +**Status**: Stable + +Validate data using arbitrary Spark SQL expressions that must evaluate to true. The most flexible validation type. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `expr` | string | Yes | `-` | SQL expression that must evaluate to true YAML: `validations[].expr` | +| `selectExpr` | array | No | `-` | SELECT columns for the expression YAML: `validations[].selectExpr` | +| `preFilterExpr` | string | No | `-` | SQL filter to apply before validation YAML: `validations[].preFilterExpr` | +| `description` | string | No | `-` | Human-readable description | +| `errorThreshold` | double | No | `-` | Allowed error rate (0-1) | + +**Examples**: + +**Expression validation**: +```yaml +validations: + - expr: "age >= 18 AND age <= 120" + description: "Age must be valid" +``` + +**Tags**: `validation`, `expression`, `sql`, `flexible` + +--- + +## Aggregation Validations + +### Group By Aggregation Validation + +**ID**: `validation.group_by` +**Status**: Stable + +Validate aggregated data grouped by specified fields. Supports sum, avg, min, max, count, and stddev aggregations. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `groupByFields` | array | Yes | `-` | Fields to group by YAML: `validations[].groupByFields` | +| `aggField` | string | No | `-` | Field to aggregate YAML: `validations[].aggField` | +| `aggType` | enum | No | `-` | Aggregation function Values: `sum`, `avg`, `min`, `max`, `count`, `stddev` YAML: `validations[].aggType` | +| `aggExpr` | string | No | `-` | Custom aggregation expression YAML: `validations[].aggExpr` | + +**Examples**: + +**Group by validation**: +```yaml +validations: + - groupByFields: ["status"] + aggField: "balance" + aggType: "avg" + aggExpr: "avg_balance > 0" + description: "Average balance per status" +``` + +**Tags**: `validation`, `aggregation`, `group-by`, `statistical` + +--- + +## Cross-Source Validations + +### Upstream Cross-Source Validation + +**ID**: `validation.upstream` +**Status**: Stable + +Validate data by joining with an upstream data source. Enables cross-system data consistency checks. 
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `upstreamDataSource` | string | Yes | `-` | Upstream data source name YAML: `validations[].upstreamDataSource` | +| `upstreamReadOptions` | object | No | `-` | Read options for upstream source YAML: `validations[].upstreamReadOptions` | +| `joinFields` | array | No | `-` | Fields to join on YAML: `validations[].joinFields` | +| `joinType` | enum | No | `outer` | Join type Values: `inner`, `left`, `right`, `full`, `outer`, `anti`, `semi` YAML: `validations[].joinType` | + +**Examples**: + +**Cross-source validation**: +```yaml +validations: + - upstreamDataSource: "source_json" + joinFields: ["account_id"] + joinType: "outer" + validations: + - expr: "source_json_name == name" +``` + +**Tags**: `validation`, `upstream`, `cross-source`, `join` + +--- + +## Schema Validations + +### Schema Field Names Validation + +**ID**: `validation.field_names` +**Status**: Stable + +Validate the schema structure by checking field/column names, counts, and ordering. + +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `names` | array | No | `-` | Expected field names YAML: `validations[].names` | +| `fieldNameType` | enum | No | `-` | Validation type for field names Values: `fieldCountEqual`, `fieldCountBetween`, `fieldNameMatchOrder`, `fieldNameMatchSet` | +| `count` | integer | No | `-` | Expected exact field count | +| `min` | integer | No | `-` | Minimum field count | +| `max` | integer | No | `-` | Maximum field count | + +**Tags**: `validation`, `schema`, `field-names`, `structure` + +--- + +## Wait Conditions + +### Wait Conditions + +**ID**: `validation.wait_condition` +**Status**: Stable + +Define conditions to wait for before running validations. Supports pause, file existence, data existence, and webhook checks.
+ +**Configuration**: + +| Option | Type | Required | Default | Description | +|--------|------|----------|---------|-------------| +| `type` | enum | No | `-` | Wait condition type Values: `pause`, `fileExists`, `dataExists`, `webhook` YAML: `validations[].waitCondition.type` | +| `pauseInSeconds` | integer | No | `-` | Seconds to pause | +| `path` | string | No | `-` | File path to wait for | +| `url` | string | No | `-` | Webhook URL | +| `method` | enum | No | `-` | HTTP method for webhook Values: `GET`, `POST`, `PUT`, `DELETE` | +| `statusCodes` | array | No | `-` | Expected HTTP status codes | +| `maxRetries` | integer | No | `-` | Maximum retry attempts | +| `waitBeforeRetrySeconds` | integer | No | `-` | Seconds between retries | + +**Tags**: `validation`, `wait`, `condition`, `async` + +--- diff --git a/misc/feature-catalog/docs/comparison-with-lite.md b/misc/feature-catalog/docs/comparison-with-lite.md new file mode 100644 index 00000000..17983132 --- /dev/null +++ b/misc/feature-catalog/docs/comparison-with-lite.md @@ -0,0 +1,344 @@ +# Data Caterer vs Data Caterer Lite — Feature Comparison + +**Data Caterer** (Scala/Spark) vs **Data Caterer Lite** (Go/DuckDB) + +- Full version: 169 features +- Lite version: 70 features implemented, plus 8 partial (41% coverage; 46% including partial) + +Legend: Y = Implemented | N = Not implemented | P = Partial + +--- + +## Data Source Connectors (16 features) + +| # | Feature | ID | Full | Lite | Notes | +|---|---------|-----|------|------|-------| +| 1 | PostgreSQL Connector | `connector.databases.postgres` | Y | Y | | +| 2 | MySQL Connector | `connector.databases.mysql` | Y | Y | | +| 3 | Cassandra Connector | `connector.databases.cassandra` | Y | N | NoSQL; no Go driver integrated | +| 4 | BigQuery Connector | `connector.databases.bigquery` | Y | N | Cloud; no Go SDK integrated | +| 5 | CSV File Connector | `connector.files.csv` | Y | Y | | +| 6 | JSON File Connector | `connector.files.json` | Y | Y | Includes JSONL support | +| 7 | Parquet File
Connector | `connector.files.parquet` | Y | Y | | +| 8 | ORC File Connector | `connector.files.orc` | Y | N | | +| 9 | Delta Lake Connector | `connector.files.delta` | Y | Y | With transaction log | +| 10 | Apache Iceberg Connector | `connector.files.iceberg` | Y | Y | Read support | +| 11 | Apache Hudi Connector | `connector.files.hudi` | Y | N | | +| 12 | XML File Connector | `connector.files.xml` | Y | N | | +| 13 | Apache Kafka Connector | `connector.messaging.kafka` | Y | N | | +| 14 | Solace JMS Connector | `connector.messaging.solace` | Y | N | | +| 15 | RabbitMQ Connector | `connector.messaging.rabbitmq` | Y | N | | +| 16 | HTTP/REST API Connector | `connector.http.http` | Y | Y | | + +**Lite total: 8/16 (50%)** + +--- + +## Data Generation — Generator Types (12 features) + +| # | Feature | ID | Full | Lite | Notes | +|---|---------|-----|------|------|-------| +| 1 | Regex Pattern Generation | `generation.field.regex` | Y | Y | Parsed to DuckDB SQL | +| 2 | DataFaker Expression | `generation.field.expression` | Y | Y | ~100 faker expressions mapped to DuckDB SQL | +| 3 | One-Of Selection | `generation.field.one_of` | Y | Y | oneOf and oneOfWeighted | +| 4 | SQL Expression | `generation.field.sql` | Y | Y | DuckDB SQL instead of Spark SQL | +| 5 | Static Value | `generation.field.static` | Y | Y | | +| 6 | UUID Generation | `generation.field.uuid` | Y | Y | | +| 7 | Sequential Value Generation | `generation.field.sequence` | Y | Y | With prefix and padding | +| 8 | Conditional Value Generation | `generation.field.conditional_value` | Y | N | CASE WHEN logic | +| 9 | Correlated Field Generation | `generation.field.correlated` | Y | N | Statistical correlation | +| 10 | Value Mapping | `generation.field.mapping` | Y | N | Source field to output mapping | +| 11 | Semantic Version Generation | `generation.field.semantic_version` | Y | N | | +| 12 | Daily Batch Sequence | `generation.field.daily_batch_sequence` | Y | N | | + +**Lite total: 7/12 (58%)** + +--- + 
+## Data Generation — Data Types (13 features) + +| # | Feature | ID | Full | Lite | Notes | +|---|---------|-----|------|------|-------| +| 1 | String Type | `generation.type.string` | Y | Y | | +| 2 | Integer Type | `generation.type.integer` | Y | Y | | +| 3 | Long Type | `generation.type.long` | Y | Y | | +| 4 | Double Type | `generation.type.double` | Y | Y | | +| 5 | Float Type | `generation.type.float` | Y | Y | | +| 6 | Decimal Type | `generation.type.decimal` | Y | Y | | +| 7 | Boolean Type | `generation.type.boolean` | Y | Y | | +| 8 | Date Type | `generation.type.date` | Y | Y | | +| 9 | Timestamp Type | `generation.type.timestamp` | Y | Y | | +| 10 | Binary Type | `generation.type.binary` | Y | Y | | +| 11 | Array Type | `generation.type.array` | Y | Y | | +| 12 | Struct Type | `generation.type.struct` | Y | Y | | +| 13 | Map Type | `generation.type.map` | Y | Y | | + +**Lite total: 13/13 (100%)** + +--- + +## Data Generation — Field Options (17 features) + +| # | Feature | ID | Full | Lite | Notes | +|---|---------|-----|------|------|-------| +| 1 | Numeric Range | `generation.option.numeric_range` | Y | Y | min/max | +| 2 | Date/Time Range | `generation.option.date_range` | Y | Y | Includes withinDays, futureDays, excludeWeekends | +| 3 | Null Value Control | `generation.option.null_handling` | Y | Y | enableNull, nullProb | +| 4 | Edge Case Generation | `generation.option.edge_cases` | Y | N | | +| 5 | String Length Control | `generation.option.string_length` | Y | P | Via regex or SQL; no explicit minLen/maxLen/avgLen options | +| 6 | Array Configuration | `generation.option.array_config` | Y | P | Basic array support; no arrayWeightedOneOf, arrayUniqueFrom | +| 7 | Map Configuration | `generation.option.map_config` | Y | P | Basic map support; no explicit mapMinSize/mapMaxSize | +| 8 | Value Distribution | `generation.option.distribution` | Y | N | uniform/normal/exponential distributions | +| 9 | Uniqueness Constraint | 
`generation.option.uniqueness` | Y | Y | isUnique, isPrimaryKey | +| 10 | Numeric Precision and Scale | `generation.option.numeric_precision` | Y | P | Supported via DuckDB types; no explicit precision/scale/round options | +| 11 | Field Omission | `generation.option.omit` | Y | N | | +| 12 | Random Seed | `generation.option.seed` | Y | N | Per-field seed | +| 13 | Distinct Value Count | `generation.option.distinct_count` | Y | N | Metadata-driven | +| 14 | Cassandra Key Configuration | `generation.option.cassandra_keys` | Y | N | Cassandra-specific | +| 15 | Incremental Generation | `generation.option.incremental` | Y | N | | +| 16 | HTTP Parameter Type | `generation.option.http_param_type` | Y | Y | path/query/header | +| 17 | Post-SQL Expression | `generation.option.post_sql_expression` | Y | N | | + +**Lite total: 6/17 (35%), plus 4 partial** + +--- + +## Data Generation — Field Labels (13 features) + +| # | Feature | ID | Full | Lite | Notes | +|---|---------|-----|------|------|-------| +| 1 | Name Label | `generation.label.name` | Y | N | Auto-detection labels not implemented | +| 2 | Username Label | `generation.label.username` | Y | N | | +| 3 | Address Label | `generation.label.address` | Y | N | | +| 4 | Application Label | `generation.label.app` | Y | N | | +| 5 | Nation Label | `generation.label.nation` | Y | N | | +| 6 | Money Label | `generation.label.money` | Y | N | | +| 7 | Internet Label | `generation.label.internet` | Y | N | | +| 8 | Food Label | `generation.label.food` | Y | N | | +| 9 | Job Label | `generation.label.job` | Y | N | | +| 10 | Relationship Label | `generation.label.relationship` | Y | N | | +| 11 | Weather Label | `generation.label.weather` | Y | N | | +| 12 | Phone Label | `generation.label.phone` | Y | N | | +| 13 | Geo Label | `generation.label.geo` | Y | N | | + +**Lite total: 0/13 (0%)** — Labels are a metadata-driven auto-detection feature not in Lite + +--- + +## Data Validation (42 features) + +### Field-Level 
Validations (29 features) + +| # | Feature | ID | Full | Lite | Notes | +|---|---------|-----|------|------|-------| +| 1 | Null Check | `validation.field.null` | Y | Y | notNull | +| 2 | Unique Values | `validation.field.unique` | Y | Y | | +| 3 | Equality Check | `validation.field.equal` | Y | N | | +| 4 | Contains Check | `validation.field.contains` | Y | N | | +| 5 | Starts With | `validation.field.starts_with` | Y | N | | +| 6 | Ends With | `validation.field.ends_with` | Y | N | | +| 7 | Less Than | `validation.field.less_than` | Y | N | | +| 8 | Greater Than | `validation.field.greater_than` | Y | N | | +| 9 | Between Range | `validation.field.between` | Y | Y | | +| 10 | In Set | `validation.field.in` | Y | Y | Also notIn | +| 11 | Regex Match | `validation.field.matches` | Y | Y | matches | +| 12 | Regex Match List | `validation.field.matches_list` | Y | N | | +| 13 | Size Check | `validation.field.size` | Y | N | | +| 14 | Less Than Size | `validation.field.less_than_size` | Y | N | | +| 15 | Greater Than Size | `validation.field.greater_than_size` | Y | N | | +| 16 | Length Between | `validation.field.length_between` | Y | P | Via minLength/maxLength options | +| 17 | Length Equal | `validation.field.length_equal` | Y | N | | +| 18 | Luhn Check | `validation.field.luhn_check` | Y | N | | +| 19 | Type Check | `validation.field.has_type` | Y | N | | +| 20 | Multi-Type Check | `validation.field.has_types` | Y | N | | +| 21 | Monotonically Decreasing | `validation.field.is_decreasing` | Y | N | | +| 22 | Monotonically Increasing | `validation.field.is_increasing` | Y | N | | +| 23 | JSON Parsable | `validation.field.is_json_parsable` | Y | N | | +| 24 | JSON Schema Match | `validation.field.match_json_schema` | Y | N | | +| 25 | DateTime Format Match | `validation.field.match_date_time_format` | Y | N | | +| 26 | Distinct In Set | `validation.field.distinct_in_set` | Y | N | | +| 27 | Distinct Contains Set | `validation.field.distinct_contains_set` | Y | N | 
| +| 28 | Distinct Equal | `validation.field.distinct_equal` | Y | N | | +| 29 | Most Common Value | `validation.field.most_common_value_in_set` | Y | N | | + +### Statistical Validations (8 features) + +| # | Feature | ID | Full | Lite | Notes | +|---|---------|-----|------|------|-------| +| 1 | Max Between | `validation.statistical.max_between` | Y | N | | +| 2 | Mean Between | `validation.statistical.mean_between` | Y | N | | +| 3 | Median Between | `validation.statistical.median_between` | Y | N | | +| 4 | Min Between | `validation.statistical.min_between` | Y | N | | +| 5 | Std Dev Between | `validation.statistical.std_dev_between` | Y | N | | +| 6 | Sum Between | `validation.statistical.sum_between` | Y | N | | +| 7 | Unique Values Proportion | `validation.statistical.unique_values_proportion_between` | Y | N | | +| 8 | Quantile Values Between | `validation.statistical.quantile_values_between` | Y | N | | + +### Other Validations (5 features) + +| # | Feature | ID | Full | Lite | Notes | +|---|---------|-----|------|------|-------| +| 1 | SQL Expression Validation | `validation.expression` | Y | Y | expr-based | +| 2 | Group By Aggregation Validation | `validation.group_by` | Y | Y | sum, avg, min, max, count, stddev | +| 3 | Upstream Cross-Source Validation | `validation.upstream` | Y | N | Cross-system joins | +| 4 | Schema Field Names Validation | `validation.field_names` | Y | N | | +| 5 | Wait Conditions | `validation.wait_condition` | Y | N | pause/fileExists/dataExists/webhook | + +**Lite total: 8/42 (19%), plus 1 partial** + +--- + +## Configuration (29 features) + +### Feature Flags (14 features) + +| # | Feature | ID | Full | Lite | Notes | +|---|---------|-----|------|------|-------| +| 1 | Count Records | `configuration.flags.enablecount` | Y | Y | | +| 2 | Generate Data | `configuration.flags.enablegeneratedata` | Y | Y | | +| 3 | Record Tracking | `configuration.flags.enablerecordtracking` | Y | Y | | +| 4 | Delete Generated Records | 
`configuration.flags.enabledeletegeneratedrecords` | Y | Y | | +| 5 | Auto-Generate Plan and Tasks | `configuration.flags.enablegenerateplanandtasks` | Y | Y | | +| 6 | Fail on Error | `configuration.flags.enablefailonerror` | Y | Y | | +| 7 | Unique Check | `configuration.flags.enableuniquecheck` | Y | Y | | +| 8 | Sink Metadata | `configuration.flags.enablesinkmetadata` | Y | Y | | +| 9 | Save Reports | `configuration.flags.enablesavereports` | Y | Y | | +| 10 | Data Validation | `configuration.flags.enablevalidation` | Y | Y | | +| 11 | Suggest Validations | `configuration.flags.enablegeneratevalidations` | Y | Y | | +| 12 | Alerts | `configuration.flags.enablealerts` | Y | Y | | +| 13 | Unique Check Only In Batch | `configuration.flags.enableuniquecheckonlyinbatch` | Y | N | | +| 14 | Fast Generation | `configuration.flags.enablefastgeneration` | Y | N | Always fast in Lite (DuckDB SQL-native) | + +### Folder Paths (7 features) + +| # | Feature | ID | Full | Lite | Notes | +|---|---------|-----|------|------|-------| +| 1 | plan File | `configuration.folders.planfilepath` | Y | Y | Via YAML_FILE env var or CLI arg | +| 2 | task Folder | `configuration.folders.taskfolderpath` | Y | N | Legacy format | +| 3 | generatedPlanAndTask Folder | `configuration.folders.generatedplanandtaskfolderpath` | Y | N | | +| 4 | generatedReports Folder | `configuration.folders.generatedreportsfolderpath` | Y | Y | | +| 5 | recordTracking Folder | `configuration.folders.recordtrackingfolderpath` | Y | N | | +| 6 | validation Folder | `configuration.folders.validationfolderpath` | Y | N | | +| 7 | recordTrackingForValidation Folder | `configuration.folders.recordtrackingforvalidationfolderpath` | Y | N | | + +### Other Configuration (8 features) + +| # | Feature | ID | Full | Lite | Notes | +|---|---------|-----|------|------|-------| +| 1 | Batch Size | `configuration.generation.batch_size` | Y | P | Has maxRecordsPerChunk instead | +| 2 | Bloom Filter Configuration | 
`configuration.generation.bloom_filter` | Y | N | | +| 3 | Metadata Analysis Configuration | `configuration.metadata` | Y | N | | +| 4 | Streaming Configuration | `configuration.streaming` | Y | P | Different streaming model | +| 5 | Alert Configuration | `configuration.alerts` | Y | N | Flag exists but Slack integration not implemented | +| 6 | Validation Runtime Configuration | `configuration.validation` | Y | N | | +| 7 | Apache Spark Configuration | `configuration.runtime.spark` | Y | N | Not applicable (DuckDB engine) | +| 8 | Global Sink Options | `configuration.sink_options` | Y | Y | seed, locale | + +**Lite total: 16/29 (55%), plus 2 partial** + +--- + +## Advanced Features (11 features) + +| # | Feature | ID | Full | Lite | Notes | +|---|---------|-----|------|------|-------| +| 1 | Foreign Key Relationships | `advanced.foreign_keys` | Y | Y | Topological sort, referential integrity | +| 2 | Foreign Key Cardinality Control | `advanced.foreign_key_cardinality` | Y | Y | min/max per parent | +| 3 | Foreign Key Nullability | `advanced.foreign_key_nullability` | Y | Y | nullPercentage | +| 4 | Foreign Key Generation Modes | `advanced.foreign_key_generation_modes` | Y | N | all-exist/all-combinations/partial | +| 5 | Record Count Configuration | `advanced.count` | Y | Y | Fixed count, per-field | +| 6 | Streaming Load Patterns | `advanced.streaming_load_patterns` | Y | Y | ramp, spike, sine, steps | +| 7 | Post-Generation Transformation | `advanced.transformation` | Y | N | Custom Java/Scala classes | +| 8 | Step Field Filtering | `advanced.step_options` | Y | N | include/excludeFields | +| 9 | Reference Mode | `advanced.reference_mode` | Y | N | | +| 10 | Configuration Interfaces | `advanced.interfaces` | Y | P | CLI + YAML only (no Java/Scala API, no Web UI) | +| 11 | Environment Variable Substitution | `advanced.env_substitution` | Y | Y | ${VAR} and ${VAR:-default} | + +**Lite total: 7/11 (64%), plus 1 partial** + +--- + +## Metadata Integration (10 
features) + +| # | Feature | ID | Full | Lite | Notes | +|---|---------|-----|------|------|-------| +| 1 | Marquez Integration | `metadata.source.marquez` | Y | N | | +| 2 | OpenMetadata Integration | `metadata.source.open_metadata` | Y | N | | +| 3 | OpenAPI/Swagger Integration | `metadata.source.open_api` | Y | Y | Generates HTTP steps | +| 4 | Great Expectations Integration | `metadata.source.great_expectations` | Y | N | | +| 5 | Open Data Contract Standard Integration | `metadata.source.open_data_contract_standard` | Y | Y | ODCS | +| 6 | Data Contract CLI Integration | `metadata.source.data_contract_cli` | Y | N | | +| 7 | Amundsen Integration | `metadata.source.amundsen` | Y | N | | +| 8 | DataHub Integration | `metadata.source.datahub` | Y | N | | +| 9 | Confluent Schema Registry Integration | `metadata.source.confluent_schema_registry` | Y | N | | +| 10 | JSON Schema Integration | `metadata.source.json_schema` | Y | Y | With allOf/anyOf/oneOf composition | + +**Lite total: 3/10 (30%)** + +Note: Lite also supports **dbt manifest.json** and **CSV/JSON/Parquet profiling** as metadata extraction sources, which are not in the Full version's feature catalog. 
+ +--- + +## UI and API (6 features) + +| # | Feature | ID | Full | Lite | Notes | +|---|---------|-----|------|------|-------| +| 1 | Connection Management | `ui.connection_management` | Y | N | Web UI only | +| 2 | Interactive Plan Creation | `ui.plan_creation` | Y | N | Web UI only | +| 3 | Execution History | `ui.execution_history` | Y | N | Web UI only | +| 4 | Real-time Results | `ui.results_viewing` | Y | N | Web UI only | +| 5 | Sample Data Generation | `ui.sample_data` | Y | Y | Via `sample` CLI command | +| 6 | Report Generation | `ui.report_generation` | Y | Y | HTML reports | + +**Lite total: 2/6 (33%)** + +--- + +## Summary by Category + +| Category | Full | Lite | Lite % | +|----------|------|------|--------| +| Connectors | 16 | 8 | 50% | +| Generation — Generator Types | 12 | 7 | 58% | +| Generation — Data Types | 13 | 13 | 100% | +| Generation — Field Options | 17 | 6 (+4P) | 35% | +| Generation — Field Labels | 13 | 0 | 0% | +| Validation | 42 | 8 (+1P) | 19% | +| Configuration | 29 | 16 (+2P) | 55% | +| Advanced Features | 11 | 7 (+1P) | 64% | +| Metadata Integration | 10 | 3 | 30% | +| UI and API | 6 | 2 | 33% | +| **Total** | **169** | **70 (+8P)** | **41% (46% incl. 
partial)** | + +--- + +## Lite-Only Features (Not in Full) + +These capabilities exist in Data Caterer Lite but are NOT listed in the Full version's feature catalog: + +| Feature | Description | +|---------|-------------| +| MCP Server | Model Context Protocol server for LLM integration (6+ tools) | +| CLI Commands | `generate`, `validate`, `extract`, `init`, `list`, `sample`, `mcp` | +| dbt Integration | Extract metadata from dbt manifest.json | +| File Profiling | Auto-detect schemas from CSV/JSON/Parquet files | +| Resource Limits | maxRecordsPerStep, maxTotalRecords, maxFileSizeMB, maxMemoryMB | +| Single Binary | ~64MB cross-platform native binary (no JVM required) | +| DuckDB Engine | All generation done via native SQL (always "fast mode") | + +--- + +## Biggest Gaps in Lite + +These are the highest-impact missing features, ordered by likely user demand: + +1. **Messaging connectors** — Kafka, RabbitMQ, Solace (0/3) +2. **Field labels / auto-detection** — All 13 labels missing +3. **Validation coverage** — Only 8/42 validation types (missing statistical, schema, cross-source) +4. **Database connectors** — Cassandra, BigQuery (0/2) +5. **File format connectors** — ORC, Hudi, XML (0/3) +6. **Advanced generation** — Conditional values, correlated fields, value mapping, edge cases +7. **Web UI** — No visual plan creation or execution management (0/4 UI features) +8. **Metadata integrations** — 7/10 sources missing (Marquez, OpenMetadata, Great Expectations, etc.) +9. **Post-generation transformation** — Custom class-based transforms +10. **Foreign key generation modes** — all-combinations and partial modes diff --git a/misc/feature-catalog/docs/index.md b/misc/feature-catalog/docs/index.md new file mode 100644 index 00000000..e7a52302 --- /dev/null +++ b/misc/feature-catalog/docs/index.md @@ -0,0 +1,217 @@ +# Data Caterer Feature Catalog + +Complete reference for all features in Data Caterer. 
+ +- **Version**: 0.19.0 +- **Total Features**: 169 +- **Last Updated**: 2026-02-11 +- **Repository**: https://github.com/data-catering/data-caterer + +## Categories + +### [Data Source Connectors](categories/connectors.md) +16 features | Data Caterer supports connecting to databases, file systems, messaging systems, and HTTP APIs for reading and writing test data. + +### [Data Generation](categories/generation.md) +55 features | Comprehensive data generation capabilities including regex patterns, faker expressions, SQL computations, and field-level configuration options. + +### [Data Validation](categories/validation.md) +42 features | Over 30 validation types for verifying generated data quality, schema compliance, statistical properties, and cross-source consistency. + +### [Configuration](categories/configuration.md) +29 features | Runtime configuration for controlling generation behavior, validation, performance tuning, alerts, and output paths. + +### [Advanced Features](categories/advanced.md) +11 features | Foreign key relationships, streaming load patterns, custom transformations, metadata-driven generation, and more. + +### [Metadata Integration](categories/metadata.md) +10 features | Import schemas and metadata from external catalogs, registries, and standards to drive automatic data generation. + +### [UI and API](categories/ui-api.md) +6 features | Web-based user interface for visual plan creation, execution management, and result viewing. + +## Feature Summary + +| Category | Features | Description | +|----------|----------|-------------| +| [Data Source Connectors](categories/connectors.md) | 16 | Data Caterer supports connecting to databases, file systems, messaging systems, ... | +| [Data Generation](categories/generation.md) | 55 | Comprehensive data generation capabilities including regex patterns, faker expre... 
| +| [Data Validation](categories/validation.md) | 42 | Over 30 validation types for verifying generated data quality, schema compliance... | +| [Configuration](categories/configuration.md) | 29 | Runtime configuration for controlling generation behavior, validation, performan... | +| [Advanced Features](categories/advanced.md) | 11 | Foreign key relationships, streaming load patterns, custom transformations, meta... | +| [Metadata Integration](categories/metadata.md) | 10 | Import schemas and metadata from external catalogs, registries, and standards to... | +| [UI and API](categories/ui-api.md) | 6 | Web-based user interface for visual plan creation, execution management, and res... | + +**Total: 169 features** + +## All Features (Alphabetical) + +- [Address Label](categories/generation.md#address-label) - `generation.label.address` +- [Alert Configuration](categories/configuration.md#alert-configuration) - `configuration.alerts` +- [Alerts](categories/configuration.md#alerts) - `configuration.flags.enablealerts` +- [Amundsen Integration](categories/metadata.md#amundsen-integration) - `metadata.source.amundsen` +- [Apache Hudi Connector](categories/connectors.md#apache-hudi-connector) - `connector.files.hudi` +- [Apache Iceberg Connector](categories/connectors.md#apache-iceberg-connector) - `connector.files.iceberg` +- [Apache Kafka Connector](categories/connectors.md#apache-kafka-connector) - `connector.messaging.kafka` +- [Apache Spark Configuration](categories/configuration.md#apache-spark-configuration) - `configuration.runtime.spark` +- [Application Label](categories/generation.md#application-label) - `generation.label.app` +- [Array Configuration](categories/generation.md#array-configuration) - `generation.option.array_config` +- [Array Type](categories/generation.md#array-type) - `generation.type.array` +- [Auto-Generate Plan and Tasks](categories/configuration.md#auto-generate-plan-and-tasks) - `configuration.flags.enablegenerateplanandtasks` +- [Batch 
Size](categories/configuration.md#batch-size) - `configuration.generation.batch_size` +- [Between Range](categories/validation.md#between-range) - `validation.field.between` +- [BigQuery Connector](categories/connectors.md#bigquery-connector) - `connector.databases.bigquery` +- [Binary Type](categories/generation.md#binary-type) - `generation.type.binary` +- [Bloom Filter Configuration](categories/configuration.md#bloom-filter-configuration) - `configuration.generation.bloom_filter` +- [Boolean Type](categories/generation.md#boolean-type) - `generation.type.boolean` +- [Cassandra Connector](categories/connectors.md#cassandra-connector) - `connector.databases.cassandra` +- [Cassandra Key Configuration](categories/generation.md#cassandra-key-configuration) - `generation.option.cassandra_keys` +- [Conditional Value Generation](categories/generation.md#conditional-value-generation) - `generation.field.conditional_value` +- [Configuration Interfaces](categories/advanced.md#configuration-interfaces) - `advanced.interfaces` +- [Confluent Schema Registry Integration](categories/metadata.md#confluent-schema-registry-integration) - `metadata.source.confluent_schema_registry` +- [Connection Management](categories/ui-api.md#connection-management) - `ui.connection_management` +- [Contains Check](categories/validation.md#contains-check) - `validation.field.contains` +- [Correlated Field Generation](categories/generation.md#correlated-field-generation) - `generation.field.correlated` +- [Count Records](categories/configuration.md#count-records) - `configuration.flags.enablecount` +- [CSV File Connector](categories/connectors.md#csv-file-connector) - `connector.files.csv` +- [Daily Batch Sequence](categories/generation.md#daily-batch-sequence) - `generation.field.daily_batch_sequence` +- [Data Contract CLI Integration](categories/metadata.md#data-contract-cli-integration) - `metadata.source.data_contract_cli` +- [Data Validation](categories/configuration.md#data-validation) - 
`configuration.flags.enablevalidation` +- [DataFaker Expression](categories/generation.md#datafaker-expression) - `generation.field.expression` +- [DataHub Integration](categories/metadata.md#datahub-integration) - `metadata.source.datahub` +- [Date Type](categories/generation.md#date-type) - `generation.type.date` +- [Date/Time Range](categories/generation.md#datetime-range) - `generation.option.date_range` +- [DateTime Format Match](categories/validation.md#datetime-format-match) - `validation.field.match_date_time_format` +- [Decimal Type](categories/generation.md#decimal-type) - `generation.type.decimal` +- [Delete Generated Records](categories/configuration.md#delete-generated-records) - `configuration.flags.enabledeletegeneratedrecords` +- [Delta Lake Connector](categories/connectors.md#delta-lake-connector) - `connector.files.delta` +- [Distinct Contains Set](categories/validation.md#distinct-contains-set) - `validation.field.distinct_contains_set` +- [Distinct Equal](categories/validation.md#distinct-equal) - `validation.field.distinct_equal` +- [Distinct In Set](categories/validation.md#distinct-in-set) - `validation.field.distinct_in_set` +- [Distinct Value Count](categories/generation.md#distinct-value-count) - `generation.option.distinct_count` +- [Double Type](categories/generation.md#double-type) - `generation.type.double` +- [Edge Case Generation](categories/generation.md#edge-case-generation) - `generation.option.edge_cases` +- [Ends With](categories/validation.md#ends-with) - `validation.field.ends_with` +- [Environment Variable Substitution](categories/advanced.md#environment-variable-substitution) - `advanced.env_substitution` +- [Equality Check](categories/validation.md#equality-check) - `validation.field.equal` +- [Execution History](categories/ui-api.md#execution-history) - `ui.execution_history` +- [Fail on Error](categories/configuration.md#fail-on-error) - `configuration.flags.enablefailonerror` +- [Fast
Generation](categories/configuration.md#fast-generation) - `configuration.flags.enablefastgeneration` +- [Field Omission](categories/generation.md#field-omission) - `generation.option.omit` +- [Float Type](categories/generation.md#float-type) - `generation.type.float` +- [Food Label](categories/generation.md#food-label) - `generation.label.food` +- [Foreign Key Cardinality Control](categories/advanced.md#foreign-key-cardinality-control) - `advanced.foreign_key_cardinality` +- [Foreign Key Generation Modes](categories/advanced.md#foreign-key-generation-modes) - `advanced.foreign_key_generation_modes` +- [Foreign Key Nullability](categories/advanced.md#foreign-key-nullability) - `advanced.foreign_key_nullability` +- [Foreign Key Relationships](categories/advanced.md#foreign-key-relationships) - `advanced.foreign_keys` +- [Generate Data](categories/configuration.md#generate-data) - `configuration.flags.enablegeneratedata` +- [generatedPlanAndTask Folder](categories/configuration.md#generatedplanandtask-folder) - `configuration.folders.generatedplanandtaskfolderpath` +- [generatedReports Folder](categories/configuration.md#generatedreports-folder) - `configuration.folders.generatedreportsfolderpath` +- [Geo Label](categories/generation.md#geo-label) - `generation.label.geo` +- [Global Sink Options](categories/configuration.md#global-sink-options) - `configuration.sink_options` +- [Great Expectations Integration](categories/metadata.md#great-expectations-integration) - `metadata.source.great_expectations` +- [Greater Than](categories/validation.md#greater-than) - `validation.field.greater_than` +- [Greater Than Size](categories/validation.md#greater-than-size) - `validation.field.greater_than_size` +- [Group By Aggregation Validation](categories/validation.md#group-by-aggregation-validation) - `validation.group_by` +- [HTTP Parameter Type](categories/generation.md#http-parameter-type) - `generation.option.http_param_type` +- [HTTP/REST API 
Connector](categories/connectors.md#http-rest-api-connector) - `connector.http.http` +- [In Set](categories/validation.md#in-set) - `validation.field.in` +- [Incremental Generation](categories/generation.md#incremental-generation) - `generation.option.incremental` +- [Integer Type](categories/generation.md#integer-type) - `generation.type.integer` +- [Interactive Plan Creation](categories/ui-api.md#interactive-plan-creation) - `ui.plan_creation` +- [Internet Label](categories/generation.md#internet-label) - `generation.label.internet` +- [Job Label](categories/generation.md#job-label) - `generation.label.job` +- [JSON File Connector](categories/connectors.md#json-file-connector) - `connector.files.json` +- [JSON Parsable](categories/validation.md#json-parsable) - `validation.field.is_json_parsable` +- [JSON Schema Integration](categories/metadata.md#json-schema-integration) - `metadata.source.json_schema` +- [JSON Schema Match](categories/validation.md#json-schema-match) - `validation.field.match_json_schema` +- [Length Between](categories/validation.md#length-between) - `validation.field.length_between` +- [Length Equal](categories/validation.md#length-equal) - `validation.field.length_equal` +- [Less Than](categories/validation.md#less-than) - `validation.field.less_than` +- [Less Than Size](categories/validation.md#less-than-size) - `validation.field.less_than_size` +- [Long Type](categories/generation.md#long-type) - `generation.type.long` +- [Luhn Check](categories/validation.md#luhn-check) - `validation.field.luhn_check` +- [Map Configuration](categories/generation.md#map-configuration) - `generation.option.map_config` +- [Map Type](categories/generation.md#map-type) - `generation.type.map` +- [Marquez Integration](categories/metadata.md#marquez-integration) - `metadata.source.marquez` +- [Max Between](categories/validation.md#max-between) - `validation.statistical.max_between` +- [Mean Between](categories/validation.md#mean-between) - 
`validation.statistical.mean_between` +- [Median Between](categories/validation.md#median-between) - `validation.statistical.median_between` +- [Metadata Analysis Configuration](categories/configuration.md#metadata-analysis-configuration) - `configuration.metadata` +- [Min Between](categories/validation.md#min-between) - `validation.statistical.min_between` +- [Money Label](categories/generation.md#money-label) - `generation.label.money` +- [Monotonically Decreasing](categories/validation.md#monotonically-decreasing) - `validation.field.is_decreasing` +- [Monotonically Increasing](categories/validation.md#monotonically-increasing) - `validation.field.is_increasing` +- [Most Common Value](categories/validation.md#most-common-value) - `validation.field.most_common_value_in_set` +- [Multi-Type Check](categories/validation.md#multi-type-check) - `validation.field.has_types` +- [MySQL Connector](categories/connectors.md#mysql-connector) - `connector.databases.mysql` +- [Name Label](categories/generation.md#name-label) - `generation.label.name` +- [Nation Label](categories/generation.md#nation-label) - `generation.label.nation` +- [Null Check](categories/validation.md#null-check) - `validation.field.null` +- [Null Value Control](categories/generation.md#null-value-control) - `generation.option.null_handling` +- [Numeric Precision and Scale](categories/generation.md#numeric-precision-and-scale) - `generation.option.numeric_precision` +- [Numeric Range](categories/generation.md#numeric-range) - `generation.option.numeric_range` +- [One-Of Selection](categories/generation.md#one-of-selection) - `generation.field.one_of` +- [Open Data Contract Standard Integration](categories/metadata.md#open-data-contract-standard-integration) - `metadata.source.open_data_contract_standard` +- [OpenAPI/Swagger Integration](categories/metadata.md#openapi-swagger-integration) - `metadata.source.open_api` +- [OpenMetadata Integration](categories/metadata.md#openmetadata-integration) - 
`metadata.source.open_metadata` +- [ORC File Connector](categories/connectors.md#orc-file-connector) - `connector.files.orc` +- [Parquet File Connector](categories/connectors.md#parquet-file-connector) - `connector.files.parquet` +- [Phone Label](categories/generation.md#phone-label) - `generation.label.phone` +- [plan File](categories/configuration.md#plan-file) - `configuration.folders.planfilepath` +- [Post-Generation Transformation](categories/advanced.md#post-generation-transformation) - `advanced.transformation` +- [Post-SQL Expression](categories/generation.md#post-sql-expression) - `generation.option.post_sql_expression` +- [PostgreSQL Connector](categories/connectors.md#postgresql-connector) - `connector.databases.postgres` +- [Quantile Values Between](categories/validation.md#quantile-values-between) - `validation.statistical.quantile_values_between` +- [RabbitMQ Connector](categories/connectors.md#rabbitmq-connector) - `connector.messaging.rabbitmq` +- [Random Seed](categories/generation.md#random-seed) - `generation.option.seed` +- [Real-time Results](categories/ui-api.md#real-time-results) - `ui.results_viewing` +- [Record Count Configuration](categories/advanced.md#record-count-configuration) - `advanced.count` +- [Record Tracking](categories/configuration.md#record-tracking) - `configuration.flags.enablerecordtracking` +- [recordTracking Folder](categories/configuration.md#recordtracking-folder) - `configuration.folders.recordtrackingfolderpath` +- [recordTrackingForValidation Folder](categories/configuration.md#recordtrackingforvalidation-folder) - `configuration.folders.recordtrackingforvalidationfolderpath` +- [Reference Mode](categories/advanced.md#reference-mode) - `advanced.reference_mode` +- [Regex Match](categories/validation.md#regex-match) - `validation.field.matches` +- [Regex Match List](categories/validation.md#regex-match-list) - `validation.field.matches_list` +- [Regex Pattern 
Generation](categories/generation.md#regex-pattern-generation) - `generation.field.regex` +- [Relationship Label](categories/generation.md#relationship-label) - `generation.label.relationship` +- [Report Generation](categories/ui-api.md#report-generation) - `ui.report_generation` +- [Sample Data Generation](categories/ui-api.md#sample-data-generation) - `ui.sample_data` +- [Save Reports](categories/configuration.md#save-reports) - `configuration.flags.enablesavereports` +- [Schema Field Names Validation](categories/validation.md#schema-field-names-validation) - `validation.field_names` +- [Semantic Version Generation](categories/generation.md#semantic-version-generation) - `generation.field.semantic_version` +- [Sequential Value Generation](categories/generation.md#sequential-value-generation) - `generation.field.sequence` +- [Sink Metadata](categories/configuration.md#sink-metadata) - `configuration.flags.enablesinkmetadata` +- [Size Check](categories/validation.md#size-check) - `validation.field.size` +- [Solace JMS Connector](categories/connectors.md#solace-jms-connector) - `connector.messaging.solace` +- [SQL Expression](categories/generation.md#sql-expression) - `generation.field.sql` +- [SQL Expression Validation](categories/validation.md#sql-expression-validation) - `validation.expression` +- [Starts With](categories/validation.md#starts-with) - `validation.field.starts_with` +- [Static Value](categories/generation.md#static-value) - `generation.field.static` +- [Std Dev Between](categories/validation.md#std-dev-between) - `validation.statistical.std_dev_between` +- [Step Field Filtering](categories/advanced.md#step-field-filtering) - `advanced.step_options` +- [Streaming Configuration](categories/configuration.md#streaming-configuration) - `configuration.streaming` +- [Streaming Load Patterns](categories/advanced.md#streaming-load-patterns) - `advanced.streaming_load_patterns` +- [String Length Control](categories/generation.md#string-length-control) - 
`generation.option.string_length` +- [String Type](categories/generation.md#string-type) - `generation.type.string` +- [Struct Type](categories/generation.md#struct-type) - `generation.type.struct` +- [Suggest Validations](categories/configuration.md#suggest-validations) - `configuration.flags.enablegeneratevalidations` +- [Sum Between](categories/validation.md#sum-between) - `validation.statistical.sum_between` +- [task Folder](categories/configuration.md#task-folder) - `configuration.folders.taskfolderpath` +- [Timestamp Type](categories/generation.md#timestamp-type) - `generation.type.timestamp` +- [Type Check](categories/validation.md#type-check) - `validation.field.has_type` +- [Unique Check](categories/configuration.md#unique-check) - `configuration.flags.enableuniquecheck` +- [Unique Check Only In Batch](categories/configuration.md#unique-check-only-in-batch) - `configuration.flags.enableuniquecheckonlyinbatch` +- [Unique Values](categories/validation.md#unique-values) - `validation.field.unique` +- [Unique Values Proportion](categories/validation.md#unique-values-proportion) - `validation.statistical.unique_values_proportion_between` +- [Uniqueness Constraint](categories/generation.md#uniqueness-constraint) - `generation.option.uniqueness` +- [Upstream Cross-Source Validation](categories/validation.md#upstream-cross-source-validation) - `validation.upstream` +- [Username Label](categories/generation.md#username-label) - `generation.label.username` +- [UUID Generation](categories/generation.md#uuid-generation) - `generation.field.uuid` +- [validation Folder](categories/configuration.md#validation-folder) - `configuration.folders.validationfolderpath` +- [Validation Runtime Configuration](categories/configuration.md#validation-runtime-configuration) - `configuration.validation` +- [Value Distribution](categories/generation.md#value-distribution) - `generation.option.distribution` +- [Value Mapping](categories/generation.md#value-mapping) - 
`generation.field.mapping` +- [Wait Conditions](categories/validation.md#wait-conditions) - `validation.wait_condition` +- [Weather Label](categories/generation.md#weather-label) - `generation.label.weather` +- [XML File Connector](categories/connectors.md#xml-file-connector) - `connector.files.xml` diff --git a/misc/feature-catalog/features.json b/misc/feature-catalog/features.json new file mode 100644 index 00000000..5f7faf23 --- /dev/null +++ b/misc/feature-catalog/features.json @@ -0,0 +1,6168 @@ +{ + "project": { + "name": "Data Caterer", + "version": "0.19.1", + "repository": "https://github.com/data-catering/data-caterer", + "lastUpdated": "2026-02-11" + }, + "categories": [ + { + "id": "advanced", + "name": "Advanced", + "featureCount": 11 + }, + { + "id": "configuration", + "name": "Configuration", + "featureCount": 29 + }, + { + "id": "connectors", + "name": "Connectors", + "featureCount": 16 + }, + { + "id": "generation", + "name": "Generation", + "featureCount": 55 + }, + { + "id": "metadata", + "name": "Metadata", + "featureCount": 10 + }, + { + "id": "ui_api", + "name": "Ui Api", + "featureCount": 6 + }, + { + "id": "validation", + "name": "Validation", + "featureCount": 42 + } + ], + "features": [ + { + "id": "connector.databases.postgres", + "name": "PostgreSQL Connector", + "category": "connectors", + "status": "stable", + "description": "Connect to PostgreSQL databases for reading and writing data. 
Supports table-level configuration, custom queries, and JDBC options.", + "subcategory": "databases", + "configuration": [ + { + "name": "url", + "type": "string", + "description": "JDBC connection URL", + "required": true, + "scope": "datasource", + "yamlPath": "dataSources[].connection.options.url" + }, + { + "name": "user", + "type": "string", + "description": "Database username", + "required": false, + "scope": "datasource", + "yamlPath": "dataSources[].connection.options.user" + }, + { + "name": "password", + "type": "string", + "description": "Database password", + "required": false, + "scope": "datasource", + "yamlPath": "dataSources[].connection.options.password" + }, + { + "name": "driver", + "type": "string", + "description": "JDBC driver class", + "required": false, + "scope": "datasource", + "default": "org.postgresql.Driver" + }, + { + "name": "dbtable", + "type": "string", + "description": "Target table (schema.table)", + "required": false, + "scope": "step", + "yamlPath": "dataSources[].steps[].options.dbtable" + }, + { + "name": "query", + "type": "string", + "description": "Custom SQL query for reading", + "required": false, + "scope": "step" + } + ], + "examples": [ + { + "format": "yaml", + "code": "dataSources:\n - name: my_postgres\n connection:\n type: postgres\n options:\n url: \"jdbc:postgresql://localhost:5432/mydb\"\n user: \"postgres\"\n password: \"${POSTGRES_PASSWORD}\"\n steps:\n - name: customers\n options:\n dbtable: \"public.customers\"\n count:\n records: 1000", + "title": "PostgreSQL data generation" + } + ], + "tags": [ + "database", + "jdbc", + "relational", + "sql" + ], + "sourceFiles": [ + { + "path": "api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala", + "role": "supporting" + } + ] + }, + { + "id": "connector.databases.mysql", + "name": "MySQL Connector", + "category": "connectors", + "status": "stable", + "description": "Connect to MySQL databases for reading and writing data. 
Supports table-level configuration and JDBC options.", + "subcategory": "databases", + "configuration": [ + { + "name": "url", + "type": "string", + "description": "JDBC connection URL", + "required": true, + "scope": "datasource" + }, + { + "name": "user", + "type": "string", + "description": "Database username", + "required": false, + "scope": "datasource" + }, + { + "name": "password", + "type": "string", + "description": "Database password", + "required": false, + "scope": "datasource" + }, + { + "name": "driver", + "type": "string", + "description": "JDBC driver class", + "required": false, + "scope": "datasource", + "default": "com.mysql.cj.jdbc.Driver" + }, + { + "name": "dbtable", + "type": "string", + "description": "Target table", + "required": false, + "scope": "step" + } + ], + "examples": [ + { + "format": "yaml", + "code": "connection:\n type: mysql\n options:\n url: \"jdbc:mysql://localhost:3306/mydb\"\n user: \"root\"\n password: \"${MYSQL_PASSWORD}\"", + "title": "MySQL connection" + } + ], + "tags": [ + "database", + "jdbc", + "relational", + "sql" + ], + "sourceFiles": [ + { + "path": "api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala", + "role": "supporting" + } + ] + }, + { + "id": "connector.databases.cassandra", + "name": "Cassandra Connector", + "category": "connectors", + "status": "stable", + "description": "Connect to Apache Cassandra for reading and writing data. 
Supports keyspace/table configuration, primary key and clustering positions.", + "subcategory": "databases", + "configuration": [ + { + "name": "url", + "type": "string", + "description": "Cassandra contact point URL", + "required": true, + "scope": "datasource" + }, + { + "name": "user", + "type": "string", + "description": "Cassandra username", + "required": false, + "scope": "datasource" + }, + { + "name": "password", + "type": "string", + "description": "Cassandra password", + "required": false, + "scope": "datasource" + }, + { + "name": "keyspace", + "type": "string", + "description": "Cassandra keyspace", + "required": true, + "scope": "step" + }, + { + "name": "table", + "type": "string", + "description": "Cassandra table", + "required": true, + "scope": "step" + } + ], + "examples": [ + { + "format": "yaml", + "code": "connection:\n type: cassandra\n options:\n url: \"localhost:9042\"\n user: \"cassandra\"\n password: \"cassandra\"", + "title": "Cassandra connection" + } + ], + "tags": [ + "database", + "nosql", + "wide-column" + ], + "sourceFiles": [ + { + "path": "api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala", + "role": "supporting" + } + ] + }, + { + "id": "connector.databases.bigquery", + "name": "BigQuery Connector", + "category": "connectors", + "status": "stable", + "description": "Connect to Google BigQuery for reading and writing data. 
Supports direct and indirect write methods.", + "subcategory": "databases", + "configuration": [ + { + "name": "table", + "type": "string", + "description": "BigQuery table (project.dataset.table)", + "required": true, + "scope": "step" + }, + { + "name": "credentialsFile", + "type": "string", + "description": "Path to GCP credentials JSON", + "required": false, + "scope": "datasource" + }, + { + "name": "writeMethod", + "type": "string", + "description": "Write method", + "required": false, + "scope": "datasource", + "default": "indirect", + "validValues": [ + "direct", + "indirect" + ] + }, + { + "name": "temporaryGcsBucket", + "type": "string", + "description": "GCS bucket for indirect writes", + "required": false, + "scope": "datasource" + }, + { + "name": "queryJobPriority", + "type": "string", + "description": "Query job priority", + "required": false, + "scope": "datasource", + "default": "batch" + } + ], + "tags": [ + "database", + "cloud", + "google", + "data-warehouse" + ], + "sourceFiles": [ + { + "path": "api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala", + "role": "supporting" + } + ] + }, + { + "id": "connector.files.csv", + "name": "CSV File Connector", + "category": "connectors", + "status": "stable", + "description": "Read and write CSV files. 
Supports headers, delimiters, and other CSV-specific options.", + "subcategory": "files", + "configuration": [ + { + "name": "path", + "type": "string", + "description": "File system path for CSV files", + "required": true, + "scope": "datasource", + "yamlPath": "dataSources[].connection.options.path" + } + ], + "examples": [ + { + "format": "yaml", + "code": "connection:\n type: csv\n options:\n path: \"/tmp/data/csv-output\"", + "title": "CSV file output" + } + ], + "tags": [ + "file", + "csv", + "delimited", + "text" + ], + "sourceFiles": [ + { + "path": "api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala", + "role": "supporting" + } + ] + }, + { + "id": "connector.files.json", + "name": "JSON File Connector", + "category": "connectors", + "status": "stable", + "description": "Read and write JSON files. Supports nested structures, arrays, and unwrapping top-level arrays.", + "subcategory": "files", + "configuration": [ + { + "name": "path", + "type": "string", + "description": "File system path for JSON files", + "required": true, + "scope": "datasource" + }, + { + "name": "unwrapTopLevelArray", + "type": "boolean", + "description": "Output JSON as root-level array instead of object", + "required": false, + "scope": "step", + "default": false + } + ], + "examples": [ + { + "format": "yaml", + "code": "connection:\n type: json\n options:\n path: \"/tmp/data/json-output\"", + "title": "JSON file output" + } + ], + "tags": [ + "file", + "json", + "structured" + ], + "sourceFiles": [ + { + "path": "api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala", + "role": "supporting" + } + ] + }, + { + "id": "connector.files.parquet", + "name": "Parquet File Connector", + "category": "connectors", + "status": "stable", + "description": "Read and write Apache Parquet columnar files. 
Efficient for large datasets.", + "subcategory": "files", + "configuration": [ + { + "name": "path", + "type": "string", + "description": "File system path for Parquet files", + "required": true, + "scope": "datasource" + } + ], + "tags": [ + "file", + "parquet", + "columnar", + "binary" + ], + "sourceFiles": [ + { + "path": "api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala", + "role": "supporting" + } + ] + }, + { + "id": "connector.files.orc", + "name": "ORC File Connector", + "category": "connectors", + "status": "stable", + "description": "Read and write Apache ORC columnar files.", + "subcategory": "files", + "configuration": [ + { + "name": "path", + "type": "string", + "description": "File system path for ORC files", + "required": true, + "scope": "datasource" + } + ], + "tags": [ + "file", + "orc", + "columnar", + "binary" + ], + "sourceFiles": [ + { + "path": "api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala", + "role": "supporting" + } + ] + }, + { + "id": "connector.files.delta", + "name": "Delta Lake Connector", + "category": "connectors", + "status": "stable", + "description": "Read and write Delta Lake tables. Supports ACID transactions, time travel, and schema evolution.", + "subcategory": "files", + "configuration": [ + { + "name": "path", + "type": "string", + "description": "File system path for Delta tables", + "required": true, + "scope": "datasource" + } + ], + "tags": [ + "file", + "delta", + "lakehouse", + "acid" + ], + "sourceFiles": [ + { + "path": "api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala", + "role": "supporting" + } + ] + }, + { + "id": "connector.files.iceberg", + "name": "Apache Iceberg Connector", + "category": "connectors", + "status": "stable", + "description": "Read and write Apache Iceberg tables. 
Supports multiple catalog types (Hadoop, Hive, REST, Glue, JDBC, Nessie).", + "subcategory": "files", + "configuration": [ + { + "name": "path", + "type": "string", + "description": "Table path", + "required": true, + "scope": "datasource" + }, + { + "name": "catalogType", + "type": "string", + "description": "Iceberg catalog type", + "required": false, + "scope": "datasource", + "default": "hadoop", + "validValues": [ + "hadoop", + "hive", + "rest", + "glue", + "jdbc", + "nessie" + ] + }, + { + "name": "catalogUri", + "type": "string", + "description": "Catalog URI (for hive/rest/nessie)", + "required": false, + "scope": "datasource" + }, + { + "name": "catalogDefaultNamespace", + "type": "string", + "description": "Default namespace", + "required": false, + "scope": "datasource" + } + ], + "tags": [ + "file", + "iceberg", + "lakehouse", + "catalog" + ], + "sourceFiles": [ + { + "path": "api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala", + "role": "supporting" + } + ] + }, + { + "id": "connector.files.hudi", + "name": "Apache Hudi Connector", + "category": "connectors", + "status": "stable", + "description": "Read and write Apache Hudi tables.", + "subcategory": "files", + "configuration": [ + { + "name": "path", + "type": "string", + "description": "Table path", + "required": true, + "scope": "datasource" + }, + { + "name": "hoodie.table.name", + "type": "string", + "description": "Hudi table name", + "required": true, + "scope": "step" + } + ], + "tags": [ + "file", + "hudi", + "lakehouse" + ], + "sourceFiles": [ + { + "path": "api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala", + "role": "supporting" + } + ] + }, + { + "id": "connector.files.xml", + "name": "XML File Connector", + "category": "connectors", + "status": "stable", + "description": "Read and write XML files.", + "subcategory": "files", + "configuration": [ + { + "name": "path", + "type": "string", + "description": "File system path for 
XML files", + "required": true, + "scope": "datasource" + } + ], + "tags": [ + "file", + "xml", + "structured" + ], + "sourceFiles": [ + { + "path": "api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala", + "role": "supporting" + } + ] + }, + { + "id": "connector.messaging.kafka", + "name": "Apache Kafka Connector", + "category": "connectors", + "status": "stable", + "description": "Connect to Apache Kafka for producing and consuming messages. Supports topics, partitions, headers, key/value serialization, and streaming patterns.", + "subcategory": "messaging", + "configuration": [ + { + "name": "url", + "type": "string", + "description": "Kafka bootstrap servers", + "required": true, + "scope": "datasource", + "yamlPath": "dataSources[].connection.options.url" + }, + { + "name": "topic", + "type": "string", + "description": "Kafka topic name", + "required": true, + "scope": "step" + }, + { + "name": "schemaLocation", + "type": "string", + "description": "Schema registry URL or file path", + "required": false, + "scope": "datasource" + } + ], + "examples": [ + { + "format": "yaml", + "code": "dataSources:\n - name: my_kafka\n connection:\n type: kafka\n options:\n url: \"localhost:9092\"\n steps:\n - name: orders_topic\n options:\n topic: \"orders\"\n count:\n duration: \"1m\"\n rate: 100\n rateUnit: \"second\"", + "title": "Kafka streaming" + } + ], + "tags": [ + "messaging", + "kafka", + "streaming", + "event" + ], + "sourceFiles": [ + { + "path": "api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala", + "role": "supporting" + } + ] + }, + { + "id": "connector.messaging.solace", + "name": "Solace JMS Connector", + "category": "connectors", + "status": "stable", + "description": "Connect to Solace PubSub+ message broker via JMS. 
Supports queues and topics.", + "subcategory": "messaging", + "configuration": [ + { + "name": "url", + "type": "string", + "description": "Solace broker URL", + "required": true, + "scope": "datasource" + }, + { + "name": "user", + "type": "string", + "description": "Username", + "required": false, + "scope": "datasource" + }, + { + "name": "password", + "type": "string", + "description": "Password", + "required": false, + "scope": "datasource" + }, + { + "name": "vpnName", + "type": "string", + "description": "VPN name", + "required": false, + "scope": "datasource", + "default": "default" + }, + { + "name": "connectionFactory", + "type": "string", + "description": "JNDI connection factory", + "required": false, + "scope": "datasource", + "default": "/jms/cf/default" + }, + { + "name": "initialContextFactory", + "type": "string", + "description": "JNDI context factory", + "required": false, + "scope": "datasource" + }, + { + "name": "destinationName", + "type": "string", + "description": "Queue/topic destination", + "required": true, + "scope": "step" + } + ], + "tags": [ + "messaging", + "jms", + "solace" + ], + "sourceFiles": [ + { + "path": "api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala", + "role": "supporting" + } + ] + }, + { + "id": "connector.messaging.rabbitmq", + "name": "RabbitMQ Connector", + "category": "connectors", + "status": "stable", + "description": "Connect to RabbitMQ message broker via JMS. 
Supports queues.", + "subcategory": "messaging", + "configuration": [ + { + "name": "url", + "type": "string", + "description": "RabbitMQ URL", + "required": true, + "scope": "datasource" + }, + { + "name": "user", + "type": "string", + "description": "Username", + "required": false, + "scope": "datasource", + "default": "guest" + }, + { + "name": "password", + "type": "string", + "description": "Password", + "required": false, + "scope": "datasource", + "default": "guest" + }, + { + "name": "virtualHost", + "type": "string", + "description": "Virtual host", + "required": false, + "scope": "datasource", + "default": "/" + }, + { + "name": "destinationName", + "type": "string", + "description": "Queue name", + "required": true, + "scope": "step" + } + ], + "tags": [ + "messaging", + "rabbitmq", + "jms", + "amqp" + ], + "sourceFiles": [ + { + "path": "api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala", + "role": "supporting" + } + ] + }, + { + "id": "connector.http.http", + "name": "HTTP/REST API Connector", + "category": "connectors", + "status": "stable", + "description": "Send generated data to HTTP/REST APIs. 
Supports custom methods, headers, URL path parameters, query parameters, and request bodies.", + "subcategory": "http", + "configuration": [ + { + "name": "url", + "type": "string", + "description": "Base URL for HTTP requests", + "required": true, + "scope": "datasource" + } + ], + "examples": [ + { + "format": "yaml", + "code": "dataSources:\n - name: my_api\n connection:\n type: http\n options:\n url: \"http://localhost:8080\"\n steps:\n - name: create_users\n fields:\n - name: httpUrl\n type: struct\n fields:\n - name: url\n static: \"http://localhost:8080/api/users\"\n - name: method\n static: \"POST\"\n - name: httpBody\n type: struct\n fields:\n - name: name\n options:\n expression: \"#{Name.fullName}\"", + "title": "HTTP API data generation" + } + ], + "tags": [ + "http", + "rest", + "api", + "web" + ], + "sourceFiles": [ + { + "path": "api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala", + "role": "supporting" + } + ] + }, + { + "id": "generation.field.regex", + "name": "Regex Pattern Generation", + "category": "generation", + "status": "stable", + "description": "Generate string values matching a regular expression pattern. 
Supports SQL-based optimization for common patterns with automatic fallback to UDF for complex patterns (lookaheads, backreferences).", + "subcategory": "generators", + "configuration": [ + { + "name": "regex", + "type": "string", + "description": "Regular expression pattern to generate values from", + "required": true, + "scope": "field", + "yamlPath": "fields[].options.regex" + } + ], + "examples": [ + { + "format": "yaml", + "code": "- name: account_id\n options:\n regex: \"ACC[0-9]{8}\"", + "title": "Simple regex pattern" + }, + { + "format": "yaml", + "code": "- name: product_code\n options:\n regex: \"[A-Z]{3}-[0-9]{4}\"", + "title": "Alphanumeric pattern" + }, + { + "format": "scala", + "code": "field.name(\"account_id\").regex(\"ACC[0-9]{8}\")", + "title": "Scala API" + } + ], + "tags": [ + "generation", + "string", + "pattern", + "regex" + ], + "relatedFeatures": [ + "configuration.flags.enablefastgeneration" + ], + "sourceFiles": [ + { + "path": "app/src/main/scala/io/github/datacatering/datacaterer/core/generator/provider/regex/RegexPatternParser.scala", + "role": "primary" + } + ], + "performanceNotes": [ + "SQL-based optimization available via enableFastGeneration flag", + "Complex patterns (lookaheads, backreferences) automatically fall back to UDF" + ] + }, + { + "id": "generation.field.expression", + "name": "DataFaker Expression", + "category": "generation", + "status": "stable", + "description": "Generate realistic fake data using DataFaker library expressions. 
Supports names, addresses, emails, phone numbers, and hundreds of other data types.", + "subcategory": "generators", + "configuration": [ + { + "name": "expression", + "type": "string", + "description": "DataFaker expression (e.g., #{Name.firstName})", + "required": true, + "scope": "field", + "yamlPath": "fields[].options.expression" + } + ], + "examples": [ + { + "format": "yaml", + "code": "- name: full_name\n options:\n expression: \"#{Name.fullName}\"", + "title": "Full name generation" + }, + { + "format": "yaml", + "code": "- name: email\n options:\n expression: \"#{Internet.emailAddress}\"", + "title": "Email generation" + }, + { + "format": "scala", + "code": "field.name(\"email\").expression(\"#{Internet.emailAddress}\")", + "title": "Scala API" + } + ], + "tags": [ + "generation", + "faker", + "realistic", + "expression" + ], + "dependencies": { + "libraries": [ + "net.datafaker:datafaker" + ] + } + }, + { + "id": "generation.field.one_of", + "name": "One-Of Selection", + "category": "generation", + "status": "stable", + "description": "Generate values by randomly selecting from a predefined list of options. Useful for categorical data like statuses, types, and enums.", + "subcategory": "generators", + "configuration": [ + { + "name": "oneOf", + "type": "array", + "description": "List of values to randomly select from", + "required": true, + "scope": "field", + "yamlPath": "fields[].options.oneOf" + } + ], + "examples": [ + { + "format": "yaml", + "code": "- name: status\n options:\n oneOf: [\"active\", \"inactive\", \"pending\"]", + "title": "Enum field" + }, + { + "format": "scala", + "code": "field.name(\"status\").oneOf(\"active\", \"inactive\", \"pending\")", + "title": "Scala API" + } + ], + "tags": [ + "generation", + "enum", + "categorical", + "selection" + ] + }, + { + "id": "generation.field.sql", + "name": "SQL Expression", + "category": "generation", + "status": "stable", + "description": "Generate field values using Spark SQL expressions. 
Supports referencing other fields, date functions, string operations, aggregations, and conditional logic.", + "subcategory": "generators", + "configuration": [ + { + "name": "sql", + "type": "string", + "description": "Spark SQL expression for computed value", + "required": true, + "scope": "field", + "yamlPath": "fields[].options.sql" + } + ], + "examples": [ + { + "format": "yaml", + "code": "- name: year\n type: integer\n options:\n sql: \"YEAR(created_at)\"", + "title": "Extract year from date field" + }, + { + "format": "yaml", + "code": "- name: full_name\n type: string\n options:\n sql: \"CONCAT(first_name, ' ', last_name)\"", + "title": "Concatenate fields" + }, + { + "format": "yaml", + "code": "- name: total_amount\n type: double\n options:\n sql: \"quantity * unit_price\"", + "title": "Computed field" + } + ], + "tags": [ + "generation", + "sql", + "computed", + "derived" + ] + }, + { + "id": "generation.field.static", + "name": "Static Value", + "category": "generation", + "status": "stable", + "description": "Set a fixed static value for all generated records. 
Useful for constant fields like API endpoints, methods, or content types.", + "subcategory": "generators", + "configuration": [ + { + "name": "static", + "type": "string", + "description": "Fixed value for all records", + "required": true, + "scope": "field", + "yamlPath": "fields[].static" + } + ], + "examples": [ + { + "format": "yaml", + "code": "- name: method\n static: \"POST\"", + "title": "Static HTTP method" + } + ], + "tags": [ + "generation", + "static", + "constant" + ] + }, + { + "id": "generation.field.uuid", + "name": "UUID Generation", + "category": "generation", + "status": "stable", + "description": "Generate universally unique identifiers (UUID v4).", + "subcategory": "generators", + "configuration": [ + { + "name": "uuidPattern", + "type": "boolean", + "description": "Enable UUID generation", + "required": false, + "scope": "field", + "default": false, + "yamlPath": "fields[].options.uuidPattern" + } + ], + "examples": [ + { + "format": "yaml", + "code": "- name: id\n options:\n uuidPattern: true", + "title": "UUID field" + } + ], + "tags": [ + "generation", + "uuid", + "identifier", + "unique" + ] + }, + { + "id": "generation.field.sequence", + "name": "Sequential Value Generation", + "category": "generation", + "status": "stable", + "description": "Generate sequential values with optional prefix and padding. 
Useful for IDs, batch numbers, and sequential identifiers.", + "subcategory": "generators", + "configuration": [ + { + "name": "sequence", + "type": "object", + "description": "Sequential value configuration with prefix and padding", + "required": true, + "scope": "field", + "yamlPath": "fields[].options.sequence" + } + ], + "examples": [ + { + "format": "yaml", + "code": "- name: order_id\n options:\n sequence:\n start: 1000\n step: 1\n prefix: \"ORD-\"\n padding: 8", + "title": "Sequential order IDs" + } + ], + "tags": [ + "generation", + "sequence", + "sequential", + "incremental" + ] + }, + { + "id": "generation.field.conditional_value", + "name": "Conditional Value Generation", + "category": "generation", + "status": "stable", + "description": "Generate values using CASE WHEN logic based on other field values. Enables dependent field generation.", + "subcategory": "generators", + "configuration": [ + { + "name": "conditionalValue", + "type": "object", + "description": "CASE WHEN conditions and result values", + "required": true, + "scope": "field", + "yamlPath": "fields[].options.conditionalValue" + } + ], + "examples": [ + { + "format": "yaml", + "code": "- name: discount\n type: double\n options:\n conditionalValue:\n conditions:\n - expr: \"customer_type = 'premium'\"\n value: 0.2\n - expr: \"customer_type = 'standard'\"\n value: 0.1\n default: 0.0", + "title": "Conditional discount" + } + ], + "tags": [ + "generation", + "conditional", + "logic", + "derived" + ] + }, + { + "id": "generation.field.correlated", + "name": "Correlated Field Generation", + "category": "generation", + "status": "stable", + "description": "Generate values that are correlated (or negatively correlated) with another field. 
Useful for creating realistic relationships between numeric fields.", + "subcategory": "generators", + "configuration": [ + { + "name": "correlatedWith", + "type": "string", + "description": "Field name to correlate with", + "required": false, + "scope": "field", + "yamlPath": "fields[].options.correlatedWith" + }, + { + "name": "negativelyCorrelatedWith", + "type": "string", + "description": "Field name to negatively correlate with", + "required": false, + "scope": "field", + "yamlPath": "fields[].options.negativelyCorrelatedWith" + } + ], + "examples": [ + { + "format": "yaml", + "code": "- name: revenue\n type: double\n options:\n correlatedWith: \"customer_count\"", + "title": "Positively correlated fields" + } + ], + "tags": [ + "generation", + "correlation", + "statistical", + "relationship" + ] + }, + { + "id": "generation.field.mapping", + "name": "Value Mapping", + "category": "generation", + "status": "stable", + "description": "Map values from one field to generate deterministic output in another field.", + "subcategory": "generators", + "configuration": [ + { + "name": "mapping", + "type": "object", + "description": "Mapping configuration from source field to output values", + "required": true, + "scope": "field", + "yamlPath": "fields[].options.mapping" + } + ], + "examples": [ + { + "format": "yaml", + "code": "- name: country_code\n options:\n mapping:\n sourceField: \"country\"\n mappings:\n \"United States\": \"US\"\n \"United Kingdom\": \"UK\"", + "title": "Country code mapping" + } + ], + "tags": [ + "generation", + "mapping", + "lookup", + "derived" + ] + }, + { + "id": "generation.type.string", + "name": "String Type", + "category": "generation", + "status": "stable", + "description": "Text data type. 
Default field type.", + "subcategory": "data_types", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set field type to \"string\"", + "required": false, + "scope": "field", + "yamlPath": "fields[].type" + } + ], + "tags": [ + "generation", + "type", + "string" + ] + }, + { + "id": "generation.type.integer", + "name": "Integer Type", + "category": "generation", + "status": "stable", + "description": "32-bit integer values.", + "subcategory": "data_types", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set field type to \"integer\"", + "required": false, + "scope": "field", + "yamlPath": "fields[].type" + } + ], + "tags": [ + "generation", + "type", + "integer" + ] + }, + { + "id": "generation.type.long", + "name": "Long Type", + "category": "generation", + "status": "stable", + "description": "64-bit long integer values.", + "subcategory": "data_types", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set field type to \"long\"", + "required": false, + "scope": "field", + "yamlPath": "fields[].type" + } + ], + "tags": [ + "generation", + "type", + "long" + ] + }, + { + "id": "generation.type.double", + "name": "Double Type", + "category": "generation", + "status": "stable", + "description": "Double-precision floating point values.", + "subcategory": "data_types", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set field type to \"double\"", + "required": false, + "scope": "field", + "yamlPath": "fields[].type" + } + ], + "tags": [ + "generation", + "type", + "double" + ] + }, + { + "id": "generation.type.float", + "name": "Float Type", + "category": "generation", + "status": "stable", + "description": "Single-precision floating point values.", + "subcategory": "data_types", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set field type to \"float\"", + "required": false, + "scope": "field", + "yamlPath": 
"fields[].type" + } + ], + "tags": [ + "generation", + "type", + "float" + ] + }, + { + "id": "generation.type.decimal", + "name": "Decimal Type", + "category": "generation", + "status": "stable", + "description": "Fixed-precision decimal values with configurable precision and scale.", + "subcategory": "data_types", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set field type to \"decimal\"", + "required": false, + "scope": "field", + "yamlPath": "fields[].type" + } + ], + "tags": [ + "generation", + "type", + "decimal" + ] + }, + { + "id": "generation.type.boolean", + "name": "Boolean Type", + "category": "generation", + "status": "stable", + "description": "True/false boolean values.", + "subcategory": "data_types", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set field type to \"boolean\"", + "required": false, + "scope": "field", + "yamlPath": "fields[].type" + } + ], + "tags": [ + "generation", + "type", + "boolean" + ] + }, + { + "id": "generation.type.date", + "name": "Date Type", + "category": "generation", + "status": "stable", + "description": "Date values (year-month-day).", + "subcategory": "data_types", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set field type to \"date\"", + "required": false, + "scope": "field", + "yamlPath": "fields[].type" + } + ], + "tags": [ + "generation", + "type", + "date" + ] + }, + { + "id": "generation.type.timestamp", + "name": "Timestamp Type", + "category": "generation", + "status": "stable", + "description": "Timestamp values with date and time.", + "subcategory": "data_types", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set field type to \"timestamp\"", + "required": false, + "scope": "field", + "yamlPath": "fields[].type" + } + ], + "tags": [ + "generation", + "type", + "timestamp" + ] + }, + { + "id": "generation.type.binary", + "name": "Binary Type", + "category": 
"generation", + "status": "stable", + "description": "Binary byte array values.", + "subcategory": "data_types", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set field type to \"binary\"", + "required": false, + "scope": "field", + "yamlPath": "fields[].type" + } + ], + "tags": [ + "generation", + "type", + "binary" + ] + }, + { + "id": "generation.type.array", + "name": "Array Type", + "category": "generation", + "status": "stable", + "description": "Array/list of elements. Configurable element type, min/max length.", + "subcategory": "data_types", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set field type to \"array\"", + "required": false, + "scope": "field", + "yamlPath": "fields[].type" + } + ], + "tags": [ + "generation", + "type", + "array" + ] + }, + { + "id": "generation.type.struct", + "name": "Struct Type", + "category": "generation", + "status": "stable", + "description": "Nested structure with named fields. 
Supports deep nesting.", + "subcategory": "data_types", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set field type to \"struct\"", + "required": false, + "scope": "field", + "yamlPath": "fields[].type" + } + ], + "tags": [ + "generation", + "type", + "struct" + ] + }, + { + "id": "generation.type.map", + "name": "Map Type", + "category": "generation", + "status": "stable", + "description": "Key-value map type with configurable key and value types.", + "subcategory": "data_types", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set field type to \"map\"", + "required": false, + "scope": "field", + "yamlPath": "fields[].type" + } + ], + "tags": [ + "generation", + "type", + "map" + ] + }, + { + "id": "generation.option.numeric_range", + "name": "Numeric Range", + "category": "generation", + "status": "stable", + "description": "Constrain numeric fields (integer, long, double, float, decimal) to a minimum and maximum range.", + "subcategory": "field_options", + "configuration": [ + { + "name": "min", + "type": "any", + "description": "Minimum value (inclusive)", + "required": false, + "scope": "field", + "yamlPath": "fields[].options.min", + "scalaConstant": "MINIMUM" + }, + { + "name": "max", + "type": "any", + "description": "Maximum value (inclusive)", + "required": false, + "scope": "field", + "yamlPath": "fields[].options.max", + "scalaConstant": "MAXIMUM" + } + ], + "examples": [ + { + "format": "yaml", + "code": "- name: age\n type: integer\n options:\n min: 18\n max: 120", + "title": "Integer range" + }, + { + "format": "yaml", + "code": "- name: price\n type: double\n options:\n min: 9.99\n max: 999.99", + "title": "Double range" + } + ], + "tags": [ + "generation", + "numeric", + "range", + "constraint" + ] + }, + { + "id": "generation.option.date_range", + "name": "Date/Time Range", + "category": "generation", + "status": "stable", + "description": "Constrain date and timestamp fields to 
a minimum and maximum range. Also supports excluding weekends, restricting to business hours, and generating dates within the last or next N days.", + "subcategory": "field_options", + "configuration": [ + { + "name": "min", + "type": "string", + "description": "Minimum date/timestamp", + "required": false, + "scope": "field", + "yamlPath": "fields[].options.min" + }, + { + "name": "max", + "type": "string", + "description": "Maximum date/timestamp", + "required": false, + "scope": "field", + "yamlPath": "fields[].options.max" + }, + { + "name": "excludeWeekends", + "type": "boolean", + "description": "Exclude Saturday and Sunday", + "required": false, + "scope": "field", + "default": false, + "yamlPath": "fields[].options.excludeWeekends", + "scalaConstant": "DATE_EXCLUDE_WEEKENDS" + }, + { + "name": "withinDays", + "type": "integer", + "description": "Generate dates within last N days from now", + "required": false, + "scope": "field", + "yamlPath": "fields[].options.withinDays" + }, + { + "name": "futureDays", + "type": "integer", + "description": "Generate dates within next N days from now", + "required": false, + "scope": "field", + "yamlPath": "fields[].options.futureDays" + }, + { + "name": "businessHours", + "type": "boolean", + "description": "Restrict to business hours", + "required": false, + "scope": "field", + "default": false, + "yamlPath": "fields[].options.businessHours" + }, + { + "name": "timeBetween", + "type": "object", + "description": "Generate times between start and end", + "required": false, + "scope": "field", + "yamlPath": "fields[].options.timeBetween" + } + ], + "examples": [ + { + "format": "yaml", + "code": "- name: created_at\n type: timestamp\n options:\n min: \"2024-01-01T00:00:00\"\n max: \"2024-12-31T23:59:59\"", + "title": "Timestamp range" + } + ], + "tags": [ + "generation", + "date", + "timestamp", + "range" + ] + }, + { + "id": "generation.option.null_handling", + "name": "Null Value Control", + "category": "generation", + "status": "stable", + "description": "Control 
whether and how often null values appear in generated data. Configurable null probability per field.", + "subcategory": "field_options", + "configuration": [ + { + "name": "enableNull", + "type": "boolean", + "description": "Allow null values for this field", + "required": false, + "scope": "field", + "default": false, + "yamlPath": "fields[].options.enableNull", + "scalaConstant": "ENABLED_NULL" + }, + { + "name": "nullProb", + "type": "double", + "description": "Probability of generating a null value (0-1)", + "required": false, + "scope": "field", + "yamlPath": "fields[].options.nullProb", + "scalaConstant": "PROBABILITY_OF_NULL" + }, + { + "name": "nullable", + "type": "boolean", + "description": "Whether the field schema allows nulls", + "required": false, + "scope": "field", + "default": true, + "yamlPath": "fields[].nullable" + } + ], + "examples": [ + { + "format": "yaml", + "code": "- name: middle_name\n options:\n enableNull: true\n nullProb: 0.3", + "title": "30% null probability" + } + ], + "tags": [ + "generation", + "null", + "nullable", + "probability" + ] + }, + { + "id": "generation.option.edge_cases", + "name": "Edge Case Generation", + "category": "generation", + "status": "stable", + "description": "Control the probability of generating edge case values (empty strings, boundary values, special characters).", + "subcategory": "field_options", + "configuration": [ + { + "name": "enableEdgeCase", + "type": "boolean", + "description": "Enable edge case generation", + "required": false, + "scope": "field", + "default": false, + "scalaConstant": "ENABLED_EDGE_CASE" + }, + { + "name": "edgeCaseProb", + "type": "double", + "description": "Probability of generating edge case values (0-1)", + "required": false, + "scope": "field", + "scalaConstant": "PROBABILITY_OF_EDGE_CASE" + } + ], + "tags": [ + "generation", + "edge-case", + "boundary", + "testing" + ] + }, + { + "id": "generation.option.string_length", + "name": "String Length Control", + "category": 
"generation", + "status": "stable", + "description": "Control the length of generated string values with minimum, maximum, and average length.", + "subcategory": "field_options", + "configuration": [ + { + "name": "minLen", + "type": "integer", + "description": "Minimum string length", + "required": false, + "scope": "field", + "scalaConstant": "MINIMUM_LENGTH" + }, + { + "name": "maxLen", + "type": "integer", + "description": "Maximum string length", + "required": false, + "scope": "field", + "scalaConstant": "MAXIMUM_LENGTH" + }, + { + "name": "avgLen", + "type": "integer", + "description": "Average string length", + "required": false, + "scope": "field", + "scalaConstant": "AVERAGE_LENGTH" + } + ], + "tags": [ + "generation", + "string", + "length", + "constraint" + ] + }, + { + "id": "generation.option.array_config", + "name": "Array Configuration", + "category": "generation", + "status": "stable", + "description": "Configure array field generation: element count, element type, uniqueness, empty probability, and weighted selection.", + "subcategory": "field_options", + "configuration": [ + { + "name": "arrayMinLen", + "type": "integer", + "description": "Minimum array length", + "required": false, + "scope": "field", + "yamlPath": "fields[].options.arrayMinLength", + "scalaConstant": "ARRAY_MINIMUM_LENGTH" + }, + { + "name": "arrayMaxLen", + "type": "integer", + "description": "Maximum array length", + "required": false, + "scope": "field", + "yamlPath": "fields[].options.arrayMaxLength", + "scalaConstant": "ARRAY_MAXIMUM_LENGTH" + }, + { + "name": "arrayFixedSize", + "type": "integer", + "description": "Fixed array size", + "required": false, + "scope": "field", + "scalaConstant": "ARRAY_FIXED_SIZE" + }, + { + "name": "arrayEmptyProb", + "type": "double", + "description": "Probability of empty array (0-1)", + "required": false, + "scope": "field", + "yamlPath": "fields[].options.arrayEmptyProbability", + "scalaConstant": "ARRAY_EMPTY_PROBABILITY" + }, + { + 
"name": "arrayType", + "type": "string", + "description": "Element data type for array", + "required": false, + "scope": "field", + "scalaConstant": "ARRAY_TYPE" + }, + { + "name": "arrayOneOf", + "type": "string", + "description": "Comma-separated values for array elements", + "required": false, + "scope": "field", + "scalaConstant": "ARRAY_ONE_OF" + }, + { + "name": "arrayUniqueFrom", + "type": "string", + "description": "Source for unique array elements", + "required": false, + "scope": "field", + "scalaConstant": "ARRAY_UNIQUE_FROM" + }, + { + "name": "arrayWeightedOneOf", + "type": "string", + "description": "Weighted selection for elements (e.g., HIGH:0.2,MEDIUM:0.5,LOW:0.3)", + "required": false, + "scope": "field", + "yamlPath": "fields[].options.arrayWeightedOneOf", + "scalaConstant": "ARRAY_WEIGHTED_ONE_OF" + } + ], + "tags": [ + "generation", + "array", + "collection", + "nested" + ] + }, + { + "id": "generation.option.map_config", + "name": "Map Configuration", + "category": "generation", + "status": "stable", + "description": "Configure map field generation with minimum and maximum size.", + "subcategory": "field_options", + "configuration": [ + { + "name": "mapMinSize", + "type": "integer", + "description": "Minimum number of entries", + "required": false, + "scope": "field", + "scalaConstant": "MAP_MINIMUM_SIZE" + }, + { + "name": "mapMaxSize", + "type": "integer", + "description": "Maximum number of entries", + "required": false, + "scope": "field", + "scalaConstant": "MAP_MAXIMUM_SIZE" + } + ], + "tags": [ + "generation", + "map", + "key-value", + "nested" + ] + }, + { + "id": "generation.option.distribution", + "name": "Value Distribution", + "category": "generation", + "status": "stable", + "description": "Control the statistical distribution of generated numeric values. 
Supports uniform, normal, and exponential distributions.", + "subcategory": "field_options", + "configuration": [ + { + "name": "distribution", + "type": "enum", + "description": "Distribution type", + "required": false, + "scope": "field", + "validValues": [ + "uniform", + "normal", + "exponential" + ], + "scalaConstant": "DISTRIBUTION" + }, + { + "name": "mean", + "type": "double", + "description": "Mean for normal distribution", + "required": false, + "scope": "field", + "scalaConstant": "MEAN" + }, + { + "name": "stddev", + "type": "double", + "description": "Standard deviation for normal distribution", + "required": false, + "scope": "field", + "scalaConstant": "STANDARD_DEVIATION" + }, + { + "name": "distributionRateParam", + "type": "double", + "description": "Rate parameter for exponential distribution", + "required": false, + "scope": "field", + "scalaConstant": "DISTRIBUTION_RATE_PARAMETER" + } + ], + "tags": [ + "generation", + "distribution", + "statistical", + "normal", + "uniform" + ] + }, + { + "id": "generation.option.uniqueness", + "name": "Uniqueness Constraint", + "category": "generation", + "status": "stable", + "description": "Enforce unique values for a field using bloom filter-based deduplication.", + "subcategory": "field_options", + "configuration": [ + { + "name": "isUnique", + "type": "boolean", + "description": "Enforce unique values", + "required": false, + "scope": "field", + "default": false, + "yamlPath": "fields[].options.isUnique", + "scalaConstant": "IS_UNIQUE" + }, + { + "name": "isPrimaryKey", + "type": "boolean", + "description": "Mark as primary key (implies unique)", + "required": false, + "scope": "field", + "default": false, + "yamlPath": "fields[].options.isPrimaryKey", + "scalaConstant": "IS_PRIMARY_KEY" + }, + { + "name": "primaryKeyPos", + "type": "integer", + "description": "Position in composite primary key", + "required": false, + "scope": "field", + "scalaConstant": "PRIMARY_KEY_POSITION" + } + ], + "tags": [ + 
"generation", + "unique", + "primary-key", + "constraint" + ], + "relatedFeatures": [ + "configuration.flags.enable_unique_check" + ] + }, + { + "id": "generation.option.numeric_precision", + "name": "Numeric Precision and Scale", + "category": "generation", + "status": "stable", + "description": "Control precision and scale for decimal fields, and rounding for numeric fields.", + "subcategory": "field_options", + "configuration": [ + { + "name": "precision", + "type": "integer", + "description": "Numeric precision (total digits)", + "required": false, + "scope": "field", + "scalaConstant": "NUMERIC_PRECISION" + }, + { + "name": "scale", + "type": "integer", + "description": "Numeric scale (decimal places)", + "required": false, + "scope": "field", + "scalaConstant": "NUMERIC_SCALE" + }, + { + "name": "round", + "type": "integer", + "description": "Round numeric values to N decimal places", + "required": false, + "scope": "field", + "scalaConstant": "ROUND" + } + ], + "tags": [ + "generation", + "numeric", + "precision", + "decimal" + ] + }, + { + "id": "generation.option.omit", + "name": "Field Omission", + "category": "generation", + "status": "stable", + "description": "Generate a field for use in computed expressions but omit it from the final output.", + "subcategory": "field_options", + "configuration": [ + { + "name": "omit", + "type": "boolean", + "description": "Omit field from output", + "required": false, + "scope": "field", + "default": false, + "yamlPath": "fields[].options.omit", + "scalaConstant": "OMIT" + } + ], + "tags": [ + "generation", + "omit", + "helper", + "computed" + ] + }, + { + "id": "generation.option.seed", + "name": "Random Seed", + "category": "generation", + "status": "stable", + "description": "Set a random seed for reproducible data generation per field.", + "subcategory": "field_options", + "configuration": [ + { + "name": "seed", + "type": "integer", + "description": "Random seed for reproducible generation", + "required": false, 
+ "scope": "field", + "yamlPath": "fields[].options.seed", + "scalaConstant": "RANDOM_SEED" + } + ], + "tags": [ + "generation", + "seed", + "reproducible", + "deterministic" + ] + }, + { + "id": "generation.option.distinct_count", + "name": "Distinct Value Count", + "category": "generation", + "status": "stable", + "description": "Control how many distinct values are generated for a field. Used with metadata-driven generation.", + "subcategory": "field_options", + "configuration": [ + { + "name": "distinctCount", + "type": "integer", + "description": "Number of distinct values to generate", + "required": false, + "scope": "field", + "scalaConstant": "DISTINCT_COUNT" + }, + { + "name": "histogram", + "type": "object", + "description": "Value distribution histogram", + "required": false, + "scope": "field", + "scalaConstant": "HISTOGRAM" + } + ], + "tags": [ + "generation", + "distinct", + "cardinality", + "metadata" + ] + }, + { + "id": "generation.option.cassandra_keys", + "name": "Cassandra Key Configuration", + "category": "generation", + "status": "stable", + "description": "Configure Cassandra-specific primary key and clustering positions for fields.", + "subcategory": "field_options", + "configuration": [ + { + "name": "isPrimaryKey", + "type": "boolean", + "description": "Mark as partition key", + "required": false, + "scope": "field", + "scalaConstant": "IS_PRIMARY_KEY" + }, + { + "name": "primaryKeyPos", + "type": "integer", + "description": "Position in composite partition key", + "required": false, + "scope": "field", + "scalaConstant": "PRIMARY_KEY_POSITION" + }, + { + "name": "clusteringPos", + "type": "integer", + "description": "Clustering column position", + "required": false, + "scope": "field", + "scalaConstant": "CLUSTERING_POSITION" + } + ], + "tags": [ + "generation", + "cassandra", + "primary-key", + "clustering" + ] + }, + { + "id": "generation.option.incremental", + "name": "Incremental Generation", + "category": "generation", + "status": 
"stable", + "description": "Mark a field for incremental generation, tracking the last generated value across runs.", + "subcategory": "field_options", + "configuration": [ + { + "name": "incremental", + "type": "boolean", + "description": "Enable incremental mode", + "required": false, + "scope": "field", + "default": false, + "scalaConstant": "INCREMENTAL" + } + ], + "tags": [ + "generation", + "incremental", + "tracking" + ] + }, + { + "id": "generation.option.http_param_type", + "name": "HTTP Parameter Type", + "category": "generation", + "status": "stable", + "description": "Specify the HTTP parameter type for a field when using the HTTP connector (path, query, or header).", + "subcategory": "field_options", + "configuration": [ + { + "name": "httpParamType", + "type": "enum", + "description": "HTTP parameter placement", + "required": false, + "scope": "field", + "validValues": [ + "path", + "query", + "header" + ], + "scalaConstant": "HTTP_PARAMETER_TYPE" + } + ], + "tags": [ + "generation", + "http", + "parameter", + "api" + ] + }, + { + "id": "generation.option.post_sql_expression", + "name": "Post-SQL Expression", + "category": "generation", + "status": "stable", + "description": "Apply a SQL expression to transform the field value after initial generation.", + "subcategory": "field_options", + "configuration": [ + { + "name": "postSqlExpression", + "type": "string", + "description": "SQL expression to apply after generation", + "required": false, + "scope": "field", + "scalaConstant": "POST_SQL_EXPRESSION" + } + ], + "tags": [ + "generation", + "sql", + "transform", + "post-processing" + ] + }, + { + "id": "generation.field.semantic_version", + "name": "Semantic Version Generation", + "category": "generation", + "status": "stable", + "description": "Generate semantic version strings (e.g., 1.2.3).", + "subcategory": "generators", + "configuration": [ + { + "name": "semanticVersion", + "type": "object", + "description": "Semantic version configuration", + 
"required": false, + "scope": "field", + "yamlPath": "fields[].options.semanticVersion" + } + ], + "tags": [ + "generation", + "version", + "semver" + ] + }, + { + "id": "generation.field.daily_batch_sequence", + "name": "Daily Batch Sequence", + "category": "generation", + "status": "stable", + "description": "Generate daily batch sequence identifiers.", + "subcategory": "generators", + "configuration": [ + { + "name": "dailyBatchSequence", + "type": "object", + "description": "Daily batch sequence configuration", + "required": false, + "scope": "field", + "yamlPath": "fields[].options.dailyBatchSequence" + } + ], + "tags": [ + "generation", + "batch", + "daily", + "sequence" + ] + }, + { + "id": "generation.label.name", + "name": "Name Label", + "category": "generation", + "status": "stable", + "description": "Generate person name fields (first name, last name, full name). Used for metadata-driven field generation to automatically select appropriate data generators.", + "subcategory": "labels", + "configuration": [ + { + "name": "label", + "type": "string", + "description": "Set field label to \"name\" for auto-detection", + "required": false, + "scope": "field", + "scalaConstant": "FIELD_LABEL" + } + ], + "tags": [ + "generation", + "label", + "metadata", + "name" + ] + }, + { + "id": "generation.label.username", + "name": "Username Label", + "category": "generation", + "status": "stable", + "description": "Generate username fields. 
Used for metadata-driven field generation to automatically select appropriate data generators.", + "subcategory": "labels", + "configuration": [ + { + "name": "label", + "type": "string", + "description": "Set field label to \"username\" for auto-detection", + "required": false, + "scope": "field", + "scalaConstant": "FIELD_LABEL" + } + ], + "tags": [ + "generation", + "label", + "metadata", + "username" + ] + }, + { + "id": "generation.label.address", + "name": "Address Label", + "category": "generation", + "status": "stable", + "description": "Generate address fields (street, city, postcode). Used for metadata-driven field generation to automatically select appropriate data generators.", + "subcategory": "labels", + "configuration": [ + { + "name": "label", + "type": "string", + "description": "Set field label to \"address\" for auto-detection", + "required": false, + "scope": "field", + "scalaConstant": "FIELD_LABEL" + } + ], + "tags": [ + "generation", + "label", + "metadata", + "address" + ] + }, + { + "id": "generation.label.app", + "name": "Application Label", + "category": "generation", + "status": "stable", + "description": "Generate application-related fields (version). Used for metadata-driven field generation to automatically select appropriate data generators.", + "subcategory": "labels", + "configuration": [ + { + "name": "label", + "type": "string", + "description": "Set field label to \"app\" for auto-detection", + "required": false, + "scope": "field", + "scalaConstant": "FIELD_LABEL" + } + ], + "tags": [ + "generation", + "label", + "metadata", + "app" + ] + }, + { + "id": "generation.label.nation", + "name": "Nation Label", + "category": "generation", + "status": "stable", + "description": "Generate nationality, language, capital city. 
Used for metadata-driven field generation to automatically select appropriate data generators.", + "subcategory": "labels", + "configuration": [ + { + "name": "label", + "type": "string", + "description": "Set field label to \"nation\" for auto-detection", + "required": false, + "scope": "field", + "scalaConstant": "FIELD_LABEL" + } + ], + "tags": [ + "generation", + "label", + "metadata", + "nation" + ] + }, + { + "id": "generation.label.money", + "name": "Money Label", + "category": "generation", + "status": "stable", + "description": "Generate currency and financial fields. Used for metadata-driven field generation to automatically select appropriate data generators.", + "subcategory": "labels", + "configuration": [ + { + "name": "label", + "type": "string", + "description": "Set field label to \"money\" for auto-detection", + "required": false, + "scope": "field", + "scalaConstant": "FIELD_LABEL" + } + ], + "tags": [ + "generation", + "label", + "metadata", + "money" + ] + }, + { + "id": "generation.label.internet", + "name": "Internet Label", + "category": "generation", + "status": "stable", + "description": "Generate email, IP, MAC address fields. Used for metadata-driven field generation to automatically select appropriate data generators.", + "subcategory": "labels", + "configuration": [ + { + "name": "label", + "type": "string", + "description": "Set field label to \"internet\" for auto-detection", + "required": false, + "scope": "field", + "scalaConstant": "FIELD_LABEL" + } + ], + "tags": [ + "generation", + "label", + "metadata", + "internet" + ] + }, + { + "id": "generation.label.food", + "name": "Food Label", + "category": "generation", + "status": "stable", + "description": "Generate food and ingredient fields. 
Used for metadata-driven field generation to automatically select appropriate data generators.", + "subcategory": "labels", + "configuration": [ + { + "name": "label", + "type": "string", + "description": "Set field label to \"food\" for auto-detection", + "required": false, + "scope": "field", + "scalaConstant": "FIELD_LABEL" + } + ], + "tags": [ + "generation", + "label", + "metadata", + "food" + ] + }, + { + "id": "generation.label.job", + "name": "Job Label", + "category": "generation", + "status": "stable", + "description": "Generate job title, field, position. Used for metadata-driven field generation to automatically select appropriate data generators.", + "subcategory": "labels", + "configuration": [ + { + "name": "label", + "type": "string", + "description": "Set field label to \"job\" for auto-detection", + "required": false, + "scope": "field", + "scalaConstant": "FIELD_LABEL" + } + ], + "tags": [ + "generation", + "label", + "metadata", + "job" + ] + }, + { + "id": "generation.label.relationship", + "name": "Relationship Label", + "category": "generation", + "status": "stable", + "description": "Generate relationship type fields. Used for metadata-driven field generation to automatically select appropriate data generators.", + "subcategory": "labels", + "configuration": [ + { + "name": "label", + "type": "string", + "description": "Set field label to \"relationship\" for auto-detection", + "required": false, + "scope": "field", + "scalaConstant": "FIELD_LABEL" + } + ], + "tags": [ + "generation", + "label", + "metadata", + "relationship" + ] + }, + { + "id": "generation.label.weather", + "name": "Weather Label", + "category": "generation", + "status": "stable", + "description": "Generate weather description fields. 
Used for metadata-driven field generation to automatically select appropriate data generators.", + "subcategory": "labels", + "configuration": [ + { + "name": "label", + "type": "string", + "description": "Set field label to \"weather\" for auto-detection", + "required": false, + "scope": "field", + "scalaConstant": "FIELD_LABEL" + } + ], + "tags": [ + "generation", + "label", + "metadata", + "weather" + ] + }, + { + "id": "generation.label.phone", + "name": "Phone Label", + "category": "generation", + "status": "stable", + "description": "Generate phone number fields. Used for metadata-driven field generation to automatically select appropriate data generators.", + "subcategory": "labels", + "configuration": [ + { + "name": "label", + "type": "string", + "description": "Set field label to \"phone\" for auto-detection", + "required": false, + "scope": "field", + "scalaConstant": "FIELD_LABEL" + } + ], + "tags": [ + "generation", + "label", + "metadata", + "phone" + ] + }, + { + "id": "generation.label.geo", + "name": "Geo Label", + "category": "generation", + "status": "stable", + "description": "Generate geographic coordinate fields. 
Used for metadata-driven field generation to automatically select appropriate data generators.", + "subcategory": "labels", + "configuration": [ + { + "name": "label", + "type": "string", + "description": "Set field label to \"geo\" for auto-detection", + "required": false, + "scope": "field", + "scalaConstant": "FIELD_LABEL" + } + ], + "tags": [ + "generation", + "label", + "metadata", + "geo" + ] + }, + { + "id": "validation.field.null", + "name": "Null Check", + "category": "validation", + "status": "stable", + "description": "Validate that a field is null (or not null with negate=true).", + "subcategory": "field_validations", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set validation type to \"null\"", + "required": false, + "scope": "field" + }, + { + "name": "negate", + "type": "boolean", + "description": "Invert the validation result", + "required": false, + "scope": "field", + "default": false + }, + { + "name": "errorThreshold", + "type": "double", + "description": "Allowed error rate (0-1)", + "required": false, + "scope": "field" + }, + { + "name": "description", + "type": "string", + "description": "Human-readable description", + "required": false, + "scope": "field" + } + ], + "tags": [ + "validation", + "field", + "null" + ] + }, + { + "id": "validation.field.unique", + "name": "Unique Values", + "category": "validation", + "status": "stable", + "description": "Validate that all values in a field are unique.", + "subcategory": "field_validations", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set validation type to \"unique\"", + "required": false, + "scope": "field" + }, + { + "name": "negate", + "type": "boolean", + "description": "Invert the validation result", + "required": false, + "scope": "field", + "default": false + }, + { + "name": "errorThreshold", + "type": "double", + "description": "Allowed error rate (0-1)", + "required": false, + "scope": "field" + }, + { + "name": 
"description", + "type": "string", + "description": "Human-readable description", + "required": false, + "scope": "field" + } + ], + "tags": [ + "validation", + "field", + "unique" + ] + }, + { + "id": "validation.field.equal", + "name": "Equality Check", + "category": "validation", + "status": "stable", + "description": "Validate that field values equal a specified value.", + "subcategory": "field_validations", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set validation type to \"equal\"", + "required": false, + "scope": "field" + }, + { + "name": "negate", + "type": "boolean", + "description": "Invert the validation result", + "required": false, + "scope": "field", + "default": false + }, + { + "name": "errorThreshold", + "type": "double", + "description": "Allowed error rate (0-1)", + "required": false, + "scope": "field" + }, + { + "name": "description", + "type": "string", + "description": "Human-readable description", + "required": false, + "scope": "field" + } + ], + "tags": [ + "validation", + "field", + "equal" + ] + }, + { + "id": "validation.field.contains", + "name": "Contains Check", + "category": "validation", + "status": "stable", + "description": "Validate that string field values contain a substring.", + "subcategory": "field_validations", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set validation type to \"contains\"", + "required": false, + "scope": "field" + }, + { + "name": "negate", + "type": "boolean", + "description": "Invert the validation result", + "required": false, + "scope": "field", + "default": false + }, + { + "name": "errorThreshold", + "type": "double", + "description": "Allowed error rate (0-1)", + "required": false, + "scope": "field" + }, + { + "name": "description", + "type": "string", + "description": "Human-readable description", + "required": false, + "scope": "field" + } + ], + "tags": [ + "validation", + "field", + "contains" + ] + }, + { + "id": 
"validation.field.starts_with", + "name": "Starts With", + "category": "validation", + "status": "stable", + "description": "Validate that string field values start with a prefix.", + "subcategory": "field_validations", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set validation type to \"startswith\"", + "required": false, + "scope": "field" + }, + { + "name": "negate", + "type": "boolean", + "description": "Invert the validation result", + "required": false, + "scope": "field", + "default": false + }, + { + "name": "errorThreshold", + "type": "double", + "description": "Allowed error rate (0-1)", + "required": false, + "scope": "field" + }, + { + "name": "description", + "type": "string", + "description": "Human-readable description", + "required": false, + "scope": "field" + } + ], + "tags": [ + "validation", + "field", + "starts-with" + ] + }, + { + "id": "validation.field.ends_with", + "name": "Ends With", + "category": "validation", + "status": "stable", + "description": "Validate that string field values end with a suffix.", + "subcategory": "field_validations", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set validation type to \"endswith\"", + "required": false, + "scope": "field" + }, + { + "name": "negate", + "type": "boolean", + "description": "Invert the validation result", + "required": false, + "scope": "field", + "default": false + }, + { + "name": "errorThreshold", + "type": "double", + "description": "Allowed error rate (0-1)", + "required": false, + "scope": "field" + }, + { + "name": "description", + "type": "string", + "description": "Human-readable description", + "required": false, + "scope": "field" + } + ], + "tags": [ + "validation", + "field", + "ends-with" + ] + }, + { + "id": "validation.field.less_than", + "name": "Less Than", + "category": "validation", + "status": "stable", + "description": "Validate that numeric values are less than a threshold.", + 
"subcategory": "field_validations", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set validation type to \"lessthan\"", + "required": false, + "scope": "field" + }, + { + "name": "negate", + "type": "boolean", + "description": "Invert the validation result", + "required": false, + "scope": "field", + "default": false + }, + { + "name": "errorThreshold", + "type": "double", + "description": "Allowed error rate (0-1)", + "required": false, + "scope": "field" + }, + { + "name": "description", + "type": "string", + "description": "Human-readable description", + "required": false, + "scope": "field" + } + ], + "tags": [ + "validation", + "field", + "less-than" + ] + }, + { + "id": "validation.field.greater_than", + "name": "Greater Than", + "category": "validation", + "status": "stable", + "description": "Validate that numeric values are greater than a threshold.", + "subcategory": "field_validations", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set validation type to \"greaterthan\"", + "required": false, + "scope": "field" + }, + { + "name": "negate", + "type": "boolean", + "description": "Invert the validation result", + "required": false, + "scope": "field", + "default": false + }, + { + "name": "errorThreshold", + "type": "double", + "description": "Allowed error rate (0-1)", + "required": false, + "scope": "field" + }, + { + "name": "description", + "type": "string", + "description": "Human-readable description", + "required": false, + "scope": "field" + } + ], + "tags": [ + "validation", + "field", + "greater-than" + ] + }, + { + "id": "validation.field.between", + "name": "Between Range", + "category": "validation", + "status": "stable", + "description": "Validate that values fall within a min/max range (inclusive).", + "subcategory": "field_validations", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set validation type to \"between\"", + "required": false, 
+ "scope": "field" + }, + { + "name": "negate", + "type": "boolean", + "description": "Invert the validation result", + "required": false, + "scope": "field", + "default": false + }, + { + "name": "errorThreshold", + "type": "double", + "description": "Allowed error rate (0-1)", + "required": false, + "scope": "field" + }, + { + "name": "description", + "type": "string", + "description": "Human-readable description", + "required": false, + "scope": "field" + } + ], + "tags": [ + "validation", + "field", + "between" + ] + }, + { + "id": "validation.field.in", + "name": "In Set", + "category": "validation", + "status": "stable", + "description": "Validate that values exist in a specified set of allowed values.", + "subcategory": "field_validations", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set validation type to \"in\"", + "required": false, + "scope": "field" + }, + { + "name": "negate", + "type": "boolean", + "description": "Invert the validation result", + "required": false, + "scope": "field", + "default": false + }, + { + "name": "errorThreshold", + "type": "double", + "description": "Allowed error rate (0-1)", + "required": false, + "scope": "field" + }, + { + "name": "description", + "type": "string", + "description": "Human-readable description", + "required": false, + "scope": "field" + } + ], + "tags": [ + "validation", + "field", + "in" + ] + }, + { + "id": "validation.field.matches", + "name": "Regex Match", + "category": "validation", + "status": "stable", + "description": "Validate that string values match a regular expression pattern.", + "subcategory": "field_validations", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set validation type to \"matches\"", + "required": false, + "scope": "field" + }, + { + "name": "negate", + "type": "boolean", + "description": "Invert the validation result", + "required": false, + "scope": "field", + "default": false + }, + { + "name": 
"errorThreshold", + "type": "double", + "description": "Allowed error rate (0-1)", + "required": false, + "scope": "field" + }, + { + "name": "description", + "type": "string", + "description": "Human-readable description", + "required": false, + "scope": "field" + } + ], + "tags": [ + "validation", + "field", + "matches" + ] + }, + { + "id": "validation.field.matches_list", + "name": "Regex Match List", + "category": "validation", + "status": "stable", + "description": "Validate that string values match one of multiple regex patterns.", + "subcategory": "field_validations", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set validation type to \"matcheslist\"", + "required": false, + "scope": "field" + }, + { + "name": "negate", + "type": "boolean", + "description": "Invert the validation result", + "required": false, + "scope": "field", + "default": false + }, + { + "name": "errorThreshold", + "type": "double", + "description": "Allowed error rate (0-1)", + "required": false, + "scope": "field" + }, + { + "name": "description", + "type": "string", + "description": "Human-readable description", + "required": false, + "scope": "field" + } + ], + "tags": [ + "validation", + "field", + "matches-list" + ] + }, + { + "id": "validation.field.size", + "name": "Size Check", + "category": "validation", + "status": "stable", + "description": "Validate the size/length of a collection or string field.", + "subcategory": "field_validations", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set validation type to \"size\"", + "required": false, + "scope": "field" + }, + { + "name": "negate", + "type": "boolean", + "description": "Invert the validation result", + "required": false, + "scope": "field", + "default": false + }, + { + "name": "errorThreshold", + "type": "double", + "description": "Allowed error rate (0-1)", + "required": false, + "scope": "field" + }, + { + "name": "description", + "type": "string", + 
"description": "Human-readable description", + "required": false, + "scope": "field" + } + ], + "tags": [ + "validation", + "field", + "size" + ] + }, + { + "id": "validation.field.less_than_size", + "name": "Less Than Size", + "category": "validation", + "status": "stable", + "description": "Validate that collection size is less than a threshold.", + "subcategory": "field_validations", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set validation type to \"lessthansize\"", + "required": false, + "scope": "field" + }, + { + "name": "negate", + "type": "boolean", + "description": "Invert the validation result", + "required": false, + "scope": "field", + "default": false + }, + { + "name": "errorThreshold", + "type": "double", + "description": "Allowed error rate (0-1)", + "required": false, + "scope": "field" + }, + { + "name": "description", + "type": "string", + "description": "Human-readable description", + "required": false, + "scope": "field" + } + ], + "tags": [ + "validation", + "field", + "less-than-size" + ] + }, + { + "id": "validation.field.greater_than_size", + "name": "Greater Than Size", + "category": "validation", + "status": "stable", + "description": "Validate that collection size is greater than a threshold.", + "subcategory": "field_validations", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set validation type to \"greaterthansize\"", + "required": false, + "scope": "field" + }, + { + "name": "negate", + "type": "boolean", + "description": "Invert the validation result", + "required": false, + "scope": "field", + "default": false + }, + { + "name": "errorThreshold", + "type": "double", + "description": "Allowed error rate (0-1)", + "required": false, + "scope": "field" + }, + { + "name": "description", + "type": "string", + "description": "Human-readable description", + "required": false, + "scope": "field" + } + ], + "tags": [ + "validation", + "field", + "greater-than-size" + 
] + }, + { + "id": "validation.field.length_between", + "name": "Length Between", + "category": "validation", + "status": "stable", + "description": "Validate that string length falls within a range.", + "subcategory": "field_validations", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set validation type to \"lengthbetween\"", + "required": false, + "scope": "field" + }, + { + "name": "negate", + "type": "boolean", + "description": "Invert the validation result", + "required": false, + "scope": "field", + "default": false + }, + { + "name": "errorThreshold", + "type": "double", + "description": "Allowed error rate (0-1)", + "required": false, + "scope": "field" + }, + { + "name": "description", + "type": "string", + "description": "Human-readable description", + "required": false, + "scope": "field" + } + ], + "tags": [ + "validation", + "field", + "length-between" + ] + }, + { + "id": "validation.field.length_equal", + "name": "Length Equal", + "category": "validation", + "status": "stable", + "description": "Validate that string length equals a specific value.", + "subcategory": "field_validations", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set validation type to \"lengthequal\"", + "required": false, + "scope": "field" + }, + { + "name": "negate", + "type": "boolean", + "description": "Invert the validation result", + "required": false, + "scope": "field", + "default": false + }, + { + "name": "errorThreshold", + "type": "double", + "description": "Allowed error rate (0-1)", + "required": false, + "scope": "field" + }, + { + "name": "description", + "type": "string", + "description": "Human-readable description", + "required": false, + "scope": "field" + } + ], + "tags": [ + "validation", + "field", + "length-equal" + ] + }, + { + "id": "validation.field.luhn_check", + "name": "Luhn Check", + "category": "validation", + "status": "stable", + "description": "Validate values using the Luhn 
algorithm (credit cards, IDs).", + "subcategory": "field_validations", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set validation type to \"luhncheck\"", + "required": false, + "scope": "field" + }, + { + "name": "negate", + "type": "boolean", + "description": "Invert the validation result", + "required": false, + "scope": "field", + "default": false + }, + { + "name": "errorThreshold", + "type": "double", + "description": "Allowed error rate (0-1)", + "required": false, + "scope": "field" + }, + { + "name": "description", + "type": "string", + "description": "Human-readable description", + "required": false, + "scope": "field" + } + ], + "tags": [ + "validation", + "field", + "luhn-check" + ] + }, + { + "id": "validation.field.has_type", + "name": "Type Check", + "category": "validation", + "status": "stable", + "description": "Validate that field values are of a specific data type.", + "subcategory": "field_validations", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set validation type to \"hastype\"", + "required": false, + "scope": "field" + }, + { + "name": "negate", + "type": "boolean", + "description": "Invert the validation result", + "required": false, + "scope": "field", + "default": false + }, + { + "name": "errorThreshold", + "type": "double", + "description": "Allowed error rate (0-1)", + "required": false, + "scope": "field" + }, + { + "name": "description", + "type": "string", + "description": "Human-readable description", + "required": false, + "scope": "field" + } + ], + "tags": [ + "validation", + "field", + "has-type" + ] + }, + { + "id": "validation.field.has_types", + "name": "Multi-Type Check", + "category": "validation", + "status": "stable", + "description": "Validate that field values match one of multiple types.", + "subcategory": "field_validations", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set validation type to \"hastypes\"", 
+ "required": false, + "scope": "field" + }, + { + "name": "negate", + "type": "boolean", + "description": "Invert the validation result", + "required": false, + "scope": "field", + "default": false + }, + { + "name": "errorThreshold", + "type": "double", + "description": "Allowed error rate (0-1)", + "required": false, + "scope": "field" + }, + { + "name": "description", + "type": "string", + "description": "Human-readable description", + "required": false, + "scope": "field" + } + ], + "tags": [ + "validation", + "field", + "has-types" + ] + }, + { + "id": "validation.field.is_decreasing", + "name": "Monotonically Decreasing", + "category": "validation", + "status": "stable", + "description": "Validate that values are in decreasing order.", + "subcategory": "field_validations", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set validation type to \"isdecreasing\"", + "required": false, + "scope": "field" + }, + { + "name": "negate", + "type": "boolean", + "description": "Invert the validation result", + "required": false, + "scope": "field", + "default": false + }, + { + "name": "errorThreshold", + "type": "double", + "description": "Allowed error rate (0-1)", + "required": false, + "scope": "field" + }, + { + "name": "description", + "type": "string", + "description": "Human-readable description", + "required": false, + "scope": "field" + } + ], + "tags": [ + "validation", + "field", + "is-decreasing" + ] + }, + { + "id": "validation.field.is_increasing", + "name": "Monotonically Increasing", + "category": "validation", + "status": "stable", + "description": "Validate that values are in increasing order.", + "subcategory": "field_validations", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set validation type to \"isincreasing\"", + "required": false, + "scope": "field" + }, + { + "name": "negate", + "type": "boolean", + "description": "Invert the validation result", + "required": false, + 
"scope": "field", + "default": false + }, + { + "name": "errorThreshold", + "type": "double", + "description": "Allowed error rate (0-1)", + "required": false, + "scope": "field" + }, + { + "name": "description", + "type": "string", + "description": "Human-readable description", + "required": false, + "scope": "field" + } + ], + "tags": [ + "validation", + "field", + "is-increasing" + ] + }, + { + "id": "validation.field.is_json_parsable", + "name": "JSON Parsable", + "category": "validation", + "status": "stable", + "description": "Validate that string values are valid JSON.", + "subcategory": "field_validations", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set validation type to \"isjsonparsable\"", + "required": false, + "scope": "field" + }, + { + "name": "negate", + "type": "boolean", + "description": "Invert the validation result", + "required": false, + "scope": "field", + "default": false + }, + { + "name": "errorThreshold", + "type": "double", + "description": "Allowed error rate (0-1)", + "required": false, + "scope": "field" + }, + { + "name": "description", + "type": "string", + "description": "Human-readable description", + "required": false, + "scope": "field" + } + ], + "tags": [ + "validation", + "field", + "is-json-parsable" + ] + }, + { + "id": "validation.field.match_json_schema", + "name": "JSON Schema Match", + "category": "validation", + "status": "stable", + "description": "Validate that JSON values conform to a JSON schema.", + "subcategory": "field_validations", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set validation type to \"matchjsonschema\"", + "required": false, + "scope": "field" + }, + { + "name": "negate", + "type": "boolean", + "description": "Invert the validation result", + "required": false, + "scope": "field", + "default": false + }, + { + "name": "errorThreshold", + "type": "double", + "description": "Allowed error rate (0-1)", + "required": false, + 
"scope": "field" + }, + { + "name": "description", + "type": "string", + "description": "Human-readable description", + "required": false, + "scope": "field" + } + ], + "tags": [ + "validation", + "field", + "match-json-schema" + ] + }, + { + "id": "validation.field.match_date_time_format", + "name": "DateTime Format Match", + "category": "validation", + "status": "stable", + "description": "Validate that values match a specific date/time format.", + "subcategory": "field_validations", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set validation type to \"matchdatetimeformat\"", + "required": false, + "scope": "field" + }, + { + "name": "negate", + "type": "boolean", + "description": "Invert the validation result", + "required": false, + "scope": "field", + "default": false + }, + { + "name": "errorThreshold", + "type": "double", + "description": "Allowed error rate (0-1)", + "required": false, + "scope": "field" + }, + { + "name": "description", + "type": "string", + "description": "Human-readable description", + "required": false, + "scope": "field" + } + ], + "tags": [ + "validation", + "field", + "match-date-time-format" + ] + }, + { + "id": "validation.field.distinct_in_set", + "name": "Distinct In Set", + "category": "validation", + "status": "stable", + "description": "Validate that all distinct values exist in a specified set.", + "subcategory": "field_validations", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set validation type to \"distinctinset\"", + "required": false, + "scope": "field" + }, + { + "name": "negate", + "type": "boolean", + "description": "Invert the validation result", + "required": false, + "scope": "field", + "default": false + }, + { + "name": "errorThreshold", + "type": "double", + "description": "Allowed error rate (0-1)", + "required": false, + "scope": "field" + }, + { + "name": "description", + "type": "string", + "description": "Human-readable description", + 
"required": false, + "scope": "field" + } + ], + "tags": [ + "validation", + "field", + "distinct-in-set" + ] + }, + { + "id": "validation.field.distinct_contains_set", + "name": "Distinct Contains Set", + "category": "validation", + "status": "stable", + "description": "Validate that distinct values contain all values from a specified set.", + "subcategory": "field_validations", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set validation type to \"distinctcontainsset\"", + "required": false, + "scope": "field" + }, + { + "name": "negate", + "type": "boolean", + "description": "Invert the validation result", + "required": false, + "scope": "field", + "default": false + }, + { + "name": "errorThreshold", + "type": "double", + "description": "Allowed error rate (0-1)", + "required": false, + "scope": "field" + }, + { + "name": "description", + "type": "string", + "description": "Human-readable description", + "required": false, + "scope": "field" + } + ], + "tags": [ + "validation", + "field", + "distinct-contains-set" + ] + }, + { + "id": "validation.field.distinct_equal", + "name": "Distinct Equal", + "category": "validation", + "status": "stable", + "description": "Validate that the set of distinct values exactly equals a specified set.", + "subcategory": "field_validations", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set validation type to \"distinctequal\"", + "required": false, + "scope": "field" + }, + { + "name": "negate", + "type": "boolean", + "description": "Invert the validation result", + "required": false, + "scope": "field", + "default": false + }, + { + "name": "errorThreshold", + "type": "double", + "description": "Allowed error rate (0-1)", + "required": false, + "scope": "field" + }, + { + "name": "description", + "type": "string", + "description": "Human-readable description", + "required": false, + "scope": "field" + } + ], + "tags": [ + "validation", + "field", + 
"distinct-equal" + ] + }, + { + "id": "validation.field.most_common_value_in_set", + "name": "Most Common Value", + "category": "validation", + "status": "stable", + "description": "Validate that the most common value is in a specified set.", + "subcategory": "field_validations", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set validation type to \"mostcommonvalueinset\"", + "required": false, + "scope": "field" + }, + { + "name": "negate", + "type": "boolean", + "description": "Invert the validation result", + "required": false, + "scope": "field", + "default": false + }, + { + "name": "errorThreshold", + "type": "double", + "description": "Allowed error rate (0-1)", + "required": false, + "scope": "field" + }, + { + "name": "description", + "type": "string", + "description": "Human-readable description", + "required": false, + "scope": "field" + } + ], + "tags": [ + "validation", + "field", + "most-common-value-in-set" + ] + }, + { + "id": "validation.statistical.max_between", + "name": "Max Between", + "category": "validation", + "status": "stable", + "description": "Validate that the maximum value of a field falls within a range.", + "subcategory": "statistical_validations", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set validation type", + "required": false, + "scope": "field" + }, + { + "name": "min", + "type": "any", + "description": "Minimum expected value", + "required": false, + "scope": "field" + }, + { + "name": "max", + "type": "any", + "description": "Maximum expected value", + "required": false, + "scope": "field" + } + ], + "tags": [ + "validation", + "statistical", + "max-between" + ] + }, + { + "id": "validation.statistical.mean_between", + "name": "Mean Between", + "category": "validation", + "status": "stable", + "description": "Validate that the mean value of a field falls within a range.", + "subcategory": "statistical_validations", + "configuration": [ + { + "name": 
"type", + "type": "string", + "description": "Set validation type", + "required": false, + "scope": "field" + }, + { + "name": "min", + "type": "any", + "description": "Minimum expected value", + "required": false, + "scope": "field" + }, + { + "name": "max", + "type": "any", + "description": "Maximum expected value", + "required": false, + "scope": "field" + } + ], + "tags": [ + "validation", + "statistical", + "mean-between" + ] + }, + { + "id": "validation.statistical.median_between", + "name": "Median Between", + "category": "validation", + "status": "stable", + "description": "Validate that the median value of a field falls within a range.", + "subcategory": "statistical_validations", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set validation type", + "required": false, + "scope": "field" + }, + { + "name": "min", + "type": "any", + "description": "Minimum expected value", + "required": false, + "scope": "field" + }, + { + "name": "max", + "type": "any", + "description": "Maximum expected value", + "required": false, + "scope": "field" + } + ], + "tags": [ + "validation", + "statistical", + "median-between" + ] + }, + { + "id": "validation.statistical.min_between", + "name": "Min Between", + "category": "validation", + "status": "stable", + "description": "Validate that the minimum value of a field falls within a range.", + "subcategory": "statistical_validations", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set validation type", + "required": false, + "scope": "field" + }, + { + "name": "min", + "type": "any", + "description": "Minimum expected value", + "required": false, + "scope": "field" + }, + { + "name": "max", + "type": "any", + "description": "Maximum expected value", + "required": false, + "scope": "field" + } + ], + "tags": [ + "validation", + "statistical", + "min-between" + ] + }, + { + "id": "validation.statistical.std_dev_between", + "name": "Std Dev Between", + 
"category": "validation", + "status": "stable", + "description": "Validate that the standard deviation of a field falls within a range.", + "subcategory": "statistical_validations", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set validation type", + "required": false, + "scope": "field" + }, + { + "name": "min", + "type": "any", + "description": "Minimum expected value", + "required": false, + "scope": "field" + }, + { + "name": "max", + "type": "any", + "description": "Maximum expected value", + "required": false, + "scope": "field" + } + ], + "tags": [ + "validation", + "statistical", + "std-dev-between" + ] + }, + { + "id": "validation.statistical.sum_between", + "name": "Sum Between", + "category": "validation", + "status": "stable", + "description": "Validate that the sum of a field falls within a range.", + "subcategory": "statistical_validations", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set validation type", + "required": false, + "scope": "field" + }, + { + "name": "min", + "type": "any", + "description": "Minimum expected value", + "required": false, + "scope": "field" + }, + { + "name": "max", + "type": "any", + "description": "Maximum expected value", + "required": false, + "scope": "field" + } + ], + "tags": [ + "validation", + "statistical", + "sum-between" + ] + }, + { + "id": "validation.statistical.unique_values_proportion_between", + "name": "Unique Values Proportion", + "category": "validation", + "status": "stable", + "description": "Validate that the proportion of unique values falls within a range.", + "subcategory": "statistical_validations", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set validation type", + "required": false, + "scope": "field" + }, + { + "name": "min", + "type": "any", + "description": "Minimum expected value", + "required": false, + "scope": "field" + }, + { + "name": "max", + "type": "any", + "description": 
"Maximum expected value", + "required": false, + "scope": "field" + } + ], + "tags": [ + "validation", + "statistical", + "unique-values-proportion-between" + ] + }, + { + "id": "validation.statistical.quantile_values_between", + "name": "Quantile Values Between", + "category": "validation", + "status": "stable", + "description": "Validate that quantile values fall within specified ranges.", + "subcategory": "statistical_validations", + "configuration": [ + { + "name": "type", + "type": "string", + "description": "Set validation type", + "required": false, + "scope": "field" + }, + { + "name": "min", + "type": "any", + "description": "Minimum expected value", + "required": false, + "scope": "field" + }, + { + "name": "max", + "type": "any", + "description": "Maximum expected value", + "required": false, + "scope": "field" + } + ], + "tags": [ + "validation", + "statistical", + "quantile-values-between" + ] + }, + { + "id": "validation.expression", + "name": "SQL Expression Validation", + "category": "validation", + "status": "stable", + "description": "Validate data using arbitrary Spark SQL expressions that must evaluate to true. 
The most flexible validation type.", + "subcategory": "expression_validations", + "configuration": [ + { + "name": "expr", + "type": "string", + "description": "SQL expression that must evaluate to true", + "required": true, + "scope": "step", + "yamlPath": "validations[].expr" + }, + { + "name": "selectExpr", + "type": "array", + "description": "SELECT columns for the expression", + "required": false, + "scope": "step", + "yamlPath": "validations[].selectExpr" + }, + { + "name": "preFilterExpr", + "type": "string", + "description": "SQL filter to apply before validation", + "required": false, + "scope": "step", + "yamlPath": "validations[].preFilterExpr" + }, + { + "name": "description", + "type": "string", + "description": "Human-readable description", + "required": false, + "scope": "step" + }, + { + "name": "errorThreshold", + "type": "double", + "description": "Allowed error rate (0-1)", + "required": false, + "scope": "step" + } + ], + "examples": [ + { + "format": "yaml", + "code": "validations:\n - expr: \"age >= 18 AND age <= 120\"\n description: \"Age must be valid\"", + "title": "Expression validation" + } + ], + "tags": [ + "validation", + "expression", + "sql", + "flexible" + ] + }, + { + "id": "validation.group_by", + "name": "Group By Aggregation Validation", + "category": "validation", + "status": "stable", + "description": "Validate aggregated data grouped by specified fields. 
Supports sum, avg, min, max, count, and stddev aggregations.", + "subcategory": "aggregation_validations", + "configuration": [ + { + "name": "groupByFields", + "type": "array", + "description": "Fields to group by", + "required": true, + "scope": "step", + "yamlPath": "validations[].groupByFields" + }, + { + "name": "aggField", + "type": "string", + "description": "Field to aggregate", + "required": false, + "scope": "step", + "yamlPath": "validations[].aggField" + }, + { + "name": "aggType", + "type": "enum", + "description": "Aggregation function", + "required": false, + "scope": "step", + "validValues": [ + "sum", + "avg", + "min", + "max", + "count", + "stddev" + ], + "yamlPath": "validations[].aggType" + }, + { + "name": "aggExpr", + "type": "string", + "description": "Custom aggregation expression", + "required": false, + "scope": "step", + "yamlPath": "validations[].aggExpr" + } + ], + "examples": [ + { + "format": "yaml", + "code": "validations:\n - groupByFields: [\"status\"]\n aggField: \"balance\"\n aggType: \"avg\"\n aggExpr: \"avg_balance > 0\"\n description: \"Average balance per status\"", + "title": "Group by validation" + } + ], + "tags": [ + "validation", + "aggregation", + "group-by", + "statistical" + ] + }, + { + "id": "validation.upstream", + "name": "Upstream Cross-Source Validation", + "category": "validation", + "status": "stable", + "description": "Validate data by joining with an upstream data source. 
Enables cross-system data consistency checks.", + "subcategory": "cross_source_validations", + "configuration": [ + { + "name": "upstreamDataSource", + "type": "string", + "description": "Upstream data source name", + "required": true, + "scope": "step", + "yamlPath": "validations[].upstreamDataSource" + }, + { + "name": "upstreamReadOptions", + "type": "object", + "description": "Read options for upstream source", + "required": false, + "scope": "step", + "yamlPath": "validations[].upstreamReadOptions" + }, + { + "name": "joinFields", + "type": "array", + "description": "Fields to join on", + "required": false, + "scope": "step", + "yamlPath": "validations[].joinFields" + }, + { + "name": "joinType", + "type": "enum", + "description": "Join type", + "required": false, + "scope": "step", + "default": "outer", + "validValues": [ + "inner", + "left", + "right", + "full", + "anti", + "semi" + ], + "yamlPath": "validations[].joinType" + } + ], + "examples": [ + { + "format": "yaml", + "code": "validations:\n - upstreamDataSource: \"source_json\"\n joinFields: [\"account_id\"]\n joinType: \"outer\"\n validations:\n - expr: \"source_json_name == name\"", + "title": "Cross-source validation" + } + ], + "tags": [ + "validation", + "upstream", + "cross-source", + "join" + ] + }, + { + "id": "validation.field_names", + "name": "Schema Field Names Validation", + "category": "validation", + "status": "stable", + "description": "Validate the schema structure by checking field/column names, counts, and ordering.", + "subcategory": "schema_validations", + "configuration": [ + { + "name": "names", + "type": "array", + "description": "Expected field names", + "required": false, + "scope": "step", + "yamlPath": "validations[].names" + }, + { + "name": "fieldNameType", + "type": "enum", + "description": "Validation type for field names", + "required": false, + "scope": "step", + "validValues": [ + "fieldCountEqual", + "fieldCountBetween", + "fieldNameMatchOrder", + 
"fieldNameMatchSet" + ] + }, + { + "name": "count", + "type": "integer", + "description": "Expected exact field count", + "required": false, + "scope": "step" + }, + { + "name": "min", + "type": "integer", + "description": "Minimum field count", + "required": false, + "scope": "step" + }, + { + "name": "max", + "type": "integer", + "description": "Maximum field count", + "required": false, + "scope": "step" + } + ], + "tags": [ + "validation", + "schema", + "field-names", + "structure" + ] + }, + { + "id": "validation.wait_condition", + "name": "Wait Conditions", + "category": "validation", + "status": "stable", + "description": "Define conditions to wait for before running validations. Supports pause, file existence, data existence, and webhook checks.", + "subcategory": "wait_conditions", + "configuration": [ + { + "name": "type", + "type": "enum", + "description": "Wait condition type", + "required": false, + "scope": "step", + "validValues": [ + "pause", + "fileExists", + "dataExists", + "webhook" + ], + "yamlPath": "validations[].waitCondition.type" + }, + { + "name": "pauseInSeconds", + "type": "integer", + "description": "Seconds to pause", + "required": false, + "scope": "step" + }, + { + "name": "path", + "type": "string", + "description": "File path to wait for", + "required": false, + "scope": "step" + }, + { + "name": "url", + "type": "string", + "description": "Webhook URL", + "required": false, + "scope": "step" + }, + { + "name": "method", + "type": "enum", + "description": "HTTP method for webhook", + "required": false, + "scope": "step", + "validValues": [ + "GET", + "POST", + "PUT", + "DELETE" + ] + }, + { + "name": "statusCodes", + "type": "array", + "description": "Expected HTTP status codes", + "required": false, + "scope": "step" + }, + { + "name": "maxRetries", + "type": "integer", + "description": "Maximum retry attempts", + "required": false, + "scope": "step" + }, + { + "name": "waitBeforeRetrySeconds", + "type": "integer", + 
"description": "Seconds between retries", + "required": false, + "scope": "step" + } + ], + "tags": [ + "validation", + "wait", + "condition", + "async" + ] + }, + { + "id": "configuration.flags.enablecount", + "name": "Count Records", + "category": "configuration", + "status": "stable", + "description": "Count the number of records generated for each data source step.", + "subcategory": "flags", + "configuration": [ + { + "name": "enableCount", + "type": "boolean", + "description": "Count the number of records generated for each data source step.", + "required": false, + "scope": "global", + "default": true, + "yamlPath": "config.flags.enableCount", + "envVar": "ENABLE_COUNT" + } + ], + "tags": [ + "configuration", + "flag", + "count" + ], + "sourceFiles": [ + { + "path": "api/src/main/scala/io/github/datacatering/datacaterer/api/model/ConfigModels.scala", + "role": "primary" + } + ] + }, + { + "id": "configuration.flags.enablegeneratedata", + "name": "Generate Data", + "category": "configuration", + "status": "stable", + "description": "Enable or disable data generation. When false, only validation runs.", + "subcategory": "flags", + "configuration": [ + { + "name": "enableGenerateData", + "type": "boolean", + "description": "Enable or disable data generation. 
When false, only validation runs.", + "required": false, + "scope": "global", + "default": true, + "yamlPath": "config.flags.enableGenerateData", + "envVar": "ENABLE_GENERATE_DATA" + } + ], + "tags": [ + "configuration", + "flag", + "generatedata" + ], + "sourceFiles": [ + { + "path": "api/src/main/scala/io/github/datacatering/datacaterer/api/model/ConfigModels.scala", + "role": "primary" + } + ] + }, + { + "id": "configuration.flags.enablerecordtracking", + "name": "Record Tracking", + "category": "configuration", + "status": "stable", + "description": "Track generated records for later cleanup/deletion.", + "subcategory": "flags", + "configuration": [ + { + "name": "enableRecordTracking", + "type": "boolean", + "description": "Track generated records for later cleanup/deletion.", + "required": false, + "scope": "global", + "default": false, + "yamlPath": "config.flags.enableRecordTracking", + "envVar": "ENABLE_RECORD_TRACKING" + } + ], + "tags": [ + "configuration", + "flag", + "recordtracking" + ], + "sourceFiles": [ + { + "path": "api/src/main/scala/io/github/datacatering/datacaterer/api/model/ConfigModels.scala", + "role": "primary" + } + ] + }, + { + "id": "configuration.flags.enabledeletegeneratedrecords", + "name": "Delete Generated Records", + "category": "configuration", + "status": "stable", + "description": "Enable cleanup mode to delete previously generated records.", + "subcategory": "flags", + "configuration": [ + { + "name": "enableDeleteGeneratedRecords", + "type": "boolean", + "description": "Enable cleanup mode to delete previously generated records.", + "required": false, + "scope": "global", + "default": false, + "yamlPath": "config.flags.enableDeleteGeneratedRecords", + "envVar": "ENABLE_DELETE_GENERATED_RECORDS" + } + ], + "tags": [ + "configuration", + "flag", + "deletegeneratedrecords" + ], + "sourceFiles": [ + { + "path": "api/src/main/scala/io/github/datacatering/datacaterer/api/model/ConfigModels.scala", + "role": "primary" + } + ] + }, 
+ { + "id": "configuration.flags.enablegenerateplanandtasks", + "name": "Auto-Generate Plan and Tasks", + "category": "configuration", + "status": "stable", + "description": "Automatically generate plan and tasks from metadata sources.", + "subcategory": "flags", + "configuration": [ + { + "name": "enableGeneratePlanAndTasks", + "type": "boolean", + "description": "Automatically generate plan and tasks from metadata sources.", + "required": false, + "scope": "global", + "default": false, + "yamlPath": "config.flags.enableGeneratePlanAndTasks", + "envVar": "ENABLE_GENERATE_PLAN_AND_TASKS" + } + ], + "tags": [ + "configuration", + "flag", + "generateplanandtasks" + ], + "sourceFiles": [ + { + "path": "api/src/main/scala/io/github/datacatering/datacaterer/api/model/ConfigModels.scala", + "role": "primary" + } + ] + }, + { + "id": "configuration.flags.enablefailonerror", + "name": "Fail on Error", + "category": "configuration", + "status": "stable", + "description": "Fail execution immediately when errors occur.", + "subcategory": "flags", + "configuration": [ + { + "name": "enableFailOnError", + "type": "boolean", + "description": "Fail execution immediately when errors occur.", + "required": false, + "scope": "global", + "default": true, + "yamlPath": "config.flags.enableFailOnError", + "envVar": "ENABLE_FAIL_ON_ERROR" + } + ], + "tags": [ + "configuration", + "flag", + "failonerror" + ], + "sourceFiles": [ + { + "path": "api/src/main/scala/io/github/datacatering/datacaterer/api/model/ConfigModels.scala", + "role": "primary" + } + ] + }, + { + "id": "configuration.flags.enableuniquecheck", + "name": "Unique Check", + "category": "configuration", + "status": "stable", + "description": "Validate uniqueness constraints during data generation.", + "subcategory": "flags", + "configuration": [ + { + "name": "enableUniqueCheck", + "type": "boolean", + "description": "Validate uniqueness constraints during data generation.", + "required": false, + "scope": "global", + 
"default": false, + "yamlPath": "config.flags.enableUniqueCheck", + "envVar": "ENABLE_UNIQUE_CHECK" + } + ], + "tags": [ + "configuration", + "flag", + "uniquecheck" + ], + "sourceFiles": [ + { + "path": "api/src/main/scala/io/github/datacatering/datacaterer/api/model/ConfigModels.scala", + "role": "primary" + } + ] + }, + { + "id": "configuration.flags.enablesinkmetadata", + "name": "Sink Metadata", + "category": "configuration", + "status": "stable", + "description": "Save metadata about generated data to the sink.", + "subcategory": "flags", + "configuration": [ + { + "name": "enableSinkMetadata", + "type": "boolean", + "description": "Save metadata about generated data to the sink.", + "required": false, + "scope": "global", + "default": false, + "yamlPath": "config.flags.enableSinkMetadata", + "envVar": "ENABLE_SINK_METADATA" + } + ], + "tags": [ + "configuration", + "flag", + "sinkmetadata" + ], + "sourceFiles": [ + { + "path": "api/src/main/scala/io/github/datacatering/datacaterer/api/model/ConfigModels.scala", + "role": "primary" + } + ] + }, + { + "id": "configuration.flags.enablesavereports", + "name": "Save Reports", + "category": "configuration", + "status": "stable", + "description": "Generate and save execution reports with generation and validation results.", + "subcategory": "flags", + "configuration": [ + { + "name": "enableSaveReports", + "type": "boolean", + "description": "Generate and save execution reports with generation and validation results.", + "required": false, + "scope": "global", + "default": true, + "yamlPath": "config.flags.enableSaveReports", + "envVar": "ENABLE_SAVE_REPORTS" + } + ], + "tags": [ + "configuration", + "flag", + "savereports" + ], + "sourceFiles": [ + { + "path": "api/src/main/scala/io/github/datacatering/datacaterer/api/model/ConfigModels.scala", + "role": "primary" + } + ] + }, + { + "id": "configuration.flags.enablevalidation", + "name": "Data Validation", + "category": "configuration", + "status": "stable", + 
"description": "Run data validations after generation completes.", + "subcategory": "flags", + "configuration": [ + { + "name": "enableValidation", + "type": "boolean", + "description": "Run data validations after generation completes.", + "required": false, + "scope": "global", + "default": true, + "yamlPath": "config.flags.enableValidation", + "envVar": "ENABLE_VALIDATION" + } + ], + "tags": [ + "configuration", + "flag", + "validation" + ], + "sourceFiles": [ + { + "path": "api/src/main/scala/io/github/datacatering/datacaterer/api/model/ConfigModels.scala", + "role": "primary" + } + ] + }, + { + "id": "configuration.flags.enablegeneratevalidations", + "name": "Suggest Validations", + "category": "configuration", + "status": "stable", + "description": "Auto-suggest validations based on data analysis.", + "subcategory": "flags", + "configuration": [ + { + "name": "enableGenerateValidations", + "type": "boolean", + "description": "Auto-suggest validations based on data analysis.", + "required": false, + "scope": "global", + "default": false, + "yamlPath": "config.flags.enableGenerateValidations", + "envVar": "ENABLE_SUGGEST_VALIDATIONS" + } + ], + "tags": [ + "configuration", + "flag", + "generatevalidations" + ], + "sourceFiles": [ + { + "path": "api/src/main/scala/io/github/datacatering/datacaterer/api/model/ConfigModels.scala", + "role": "primary" + } + ] + }, + { + "id": "configuration.flags.enablealerts", + "name": "Alerts", + "category": "configuration", + "status": "stable", + "description": "Send alert notifications on completion (supports Slack).", + "subcategory": "flags", + "configuration": [ + { + "name": "enableAlerts", + "type": "boolean", + "description": "Send alert notifications on completion (supports Slack).", + "required": false, + "scope": "global", + "default": true, + "yamlPath": "config.flags.enableAlerts", + "envVar": "ENABLE_ALERTS" + } + ], + "tags": [ + "configuration", + "flag", + "alerts" + ], + "sourceFiles": [ + { + "path": 
"api/src/main/scala/io/github/datacatering/datacaterer/api/model/ConfigModels.scala", + "role": "primary" + } + ] + }, + { + "id": "configuration.flags.enableuniquecheckonlyinbatch", + "name": "Unique Check Only In Batch", + "category": "configuration", + "status": "stable", + "description": "Check uniqueness only within the current batch for better performance.", + "subcategory": "flags", + "configuration": [ + { + "name": "enableUniqueCheckOnlyInBatch", + "type": "boolean", + "description": "Check uniqueness only within the current batch for better performance.", + "required": false, + "scope": "global", + "default": false, + "yamlPath": "config.flags.enableUniqueCheckOnlyInBatch", + "envVar": "ENABLE_UNIQUE_CHECK_ONLY_WITHIN_BATCH" + } + ], + "tags": [ + "configuration", + "flag", + "uniquecheckonlyinbatch" + ], + "sourceFiles": [ + { + "path": "api/src/main/scala/io/github/datacatering/datacaterer/api/model/ConfigModels.scala", + "role": "primary" + } + ] + }, + { + "id": "configuration.flags.enablefastgeneration", + "name": "Fast Generation", + "category": "configuration", + "status": "stable", + "description": "Use SQL-based generation for regex patterns instead of UDFs. Dramatically improves performance.", + "subcategory": "flags", + "configuration": [ + { + "name": "enableFastGeneration", + "type": "boolean", + "description": "Use SQL-based generation for regex patterns instead of UDFs. 
Dramatically improves performance.", + "required": false, + "scope": "global", + "default": false, + "yamlPath": "config.flags.enableFastGeneration", + "envVar": "ENABLE_FAST_GENERATION" + } + ], + "tags": [ + "configuration", + "flag", + "fastgeneration" + ], + "sourceFiles": [ + { + "path": "api/src/main/scala/io/github/datacatering/datacaterer/api/model/ConfigModels.scala", + "role": "primary" + } + ] + }, + { + "id": "configuration.folders.planfilepath", + "name": "plan File", + "category": "configuration", + "status": "stable", + "description": "Configuration path for planFilePath.", + "subcategory": "folders", + "configuration": [ + { + "name": "planFilePath", + "type": "string", + "description": "Path setting for planFilePath", + "required": false, + "scope": "global", + "yamlPath": "config.folders.planFilePath" + } + ], + "tags": [ + "configuration", + "folder", + "path" + ] + }, + { + "id": "configuration.folders.taskfolderpath", + "name": "task Folder", + "category": "configuration", + "status": "stable", + "description": "Configuration path for taskFolderPath.", + "subcategory": "folders", + "configuration": [ + { + "name": "taskFolderPath", + "type": "string", + "description": "Path setting for taskFolderPath", + "required": false, + "scope": "global", + "yamlPath": "config.folders.taskFolderPath" + } + ], + "tags": [ + "configuration", + "folder", + "path" + ] + }, + { + "id": "configuration.folders.generatedplanandtaskfolderpath", + "name": "generatedPlanAndTask Folder", + "category": "configuration", + "status": "stable", + "description": "Configuration path for generatedPlanAndTaskFolderPath.", + "subcategory": "folders", + "configuration": [ + { + "name": "generatedPlanAndTaskFolderPath", + "type": "string", + "description": "Path setting for generatedPlanAndTaskFolderPath", + "required": false, + "scope": "global", + "yamlPath": "config.folders.generatedPlanAndTaskFolderPath" + } + ], + "tags": [ + "configuration", + "folder", + "path" + ] + }, + 
{ + "id": "configuration.folders.generatedreportsfolderpath", + "name": "generatedReports Folder", + "category": "configuration", + "status": "stable", + "description": "Configuration path for generatedReportsFolderPath.", + "subcategory": "folders", + "configuration": [ + { + "name": "generatedReportsFolderPath", + "type": "string", + "description": "Path setting for generatedReportsFolderPath", + "required": false, + "scope": "global", + "yamlPath": "config.folders.generatedReportsFolderPath" + } + ], + "tags": [ + "configuration", + "folder", + "path" + ] + }, + { + "id": "configuration.folders.recordtrackingfolderpath", + "name": "recordTracking Folder", + "category": "configuration", + "status": "stable", + "description": "Configuration path for recordTrackingFolderPath.", + "subcategory": "folders", + "configuration": [ + { + "name": "recordTrackingFolderPath", + "type": "string", + "description": "Path setting for recordTrackingFolderPath", + "required": false, + "scope": "global", + "yamlPath": "config.folders.recordTrackingFolderPath" + } + ], + "tags": [ + "configuration", + "folder", + "path" + ] + }, + { + "id": "configuration.folders.validationfolderpath", + "name": "validation Folder", + "category": "configuration", + "status": "stable", + "description": "Configuration path for validationFolderPath.", + "subcategory": "folders", + "configuration": [ + { + "name": "validationFolderPath", + "type": "string", + "description": "Path setting for validationFolderPath", + "required": false, + "scope": "global", + "yamlPath": "config.folders.validationFolderPath" + } + ], + "tags": [ + "configuration", + "folder", + "path" + ] + }, + { + "id": "configuration.folders.recordtrackingforvalidationfolderpath", + "name": "recordTrackingForValidation Folder", + "category": "configuration", + "status": "stable", + "description": "Configuration path for recordTrackingForValidationFolderPath.", + "subcategory": "folders", + "configuration": [ + { + "name": 
"recordTrackingForValidationFolderPath", + "type": "string", + "description": "Path setting for recordTrackingForValidationFolderPath", + "required": false, + "scope": "global", + "yamlPath": "config.folders.recordTrackingForValidationFolderPath" + } + ], + "tags": [ + "configuration", + "folder", + "path" + ] + }, + { + "id": "configuration.generation.batch_size", + "name": "Batch Size", + "category": "configuration", + "status": "stable", + "description": "Control the number of records generated per batch. Affects memory usage and performance.", + "subcategory": "generation", + "configuration": [ + { + "name": "numRecordsPerBatch", + "type": "long", + "description": "Records per batch", + "required": false, + "scope": "global", + "default": 100000, + "yamlPath": "config.generation.numRecordsPerBatch" + }, + { + "name": "numRecordsPerStep", + "type": "long", + "description": "Default records per step/table", + "required": false, + "scope": "global", + "yamlPath": "config.generation.numRecordsPerStep" + } + ], + "tags": [ + "configuration", + "generation", + "batch", + "performance" + ] + }, + { + "id": "configuration.generation.bloom_filter", + "name": "Bloom Filter Configuration", + "category": "configuration", + "status": "stable", + "description": "Configure bloom filter parameters for uniqueness checking during generation.", + "subcategory": "generation", + "configuration": [ + { + "name": "uniqueBloomFilterNumItems", + "type": "long", + "description": "Expected number of items in bloom filter", + "required": false, + "scope": "global", + "default": 10000000, + "yamlPath": "config.generation.uniqueBloomFilterNumItems" + }, + { + "name": "uniqueBloomFilterFalsePositiveProbability", + "type": "double", + "description": "Bloom filter false positive rate (0-1)", + "required": false, + "scope": "global", + "default": 0.01, + "range": { + "min": 0, + "max": 1 + }, + "yamlPath": "config.generation.uniqueBloomFilterFalsePositiveProbability" + } + ], + "tags": [ + 
"configuration", + "generation", + "bloom-filter", + "uniqueness" + ] + }, + { + "id": "configuration.metadata", + "name": "Metadata Analysis Configuration", + "category": "configuration", + "status": "stable", + "description": "Configure how metadata is sampled and analyzed for auto-generation of field patterns.", + "subcategory": "metadata", + "configuration": [ + { + "name": "numRecordsFromDataSource", + "type": "integer", + "description": "Sample size from data source", + "required": false, + "scope": "global", + "default": 10000, + "yamlPath": "config.metadata.numRecordsFromDataSource" + }, + { + "name": "numRecordsForAnalysis", + "type": "integer", + "description": "Records analyzed for pattern detection", + "required": false, + "scope": "global", + "default": 10000, + "yamlPath": "config.metadata.numRecordsForAnalysis" + }, + { + "name": "oneOfDistinctCountVsCountThreshold", + "type": "double", + "description": "Threshold for detecting oneOf fields", + "required": false, + "scope": "global", + "default": 0.2, + "yamlPath": "config.metadata.oneOfDistinctCountVsCountThreshold" + }, + { + "name": "oneOfMinCount", + "type": "long", + "description": "Minimum records for oneOf detection", + "required": false, + "scope": "global", + "default": 1000, + "yamlPath": "config.metadata.oneOfMinCount" + }, + { + "name": "numGeneratedSamples", + "type": "integer", + "description": "Number of sample records in metadata suggestions", + "required": false, + "scope": "global", + "default": 10, + "yamlPath": "config.metadata.numGeneratedSamples" + } + ], + "tags": [ + "configuration", + "metadata", + "analysis", + "sampling" + ] + }, + { + "id": "configuration.streaming", + "name": "Streaming Configuration", + "category": "configuration", + "status": "stable", + "description": "Configure streaming/real-time data generation parameters.", + "subcategory": "streaming", + "configuration": [ + { + "name": "maxTimeoutSeconds", + "type": "integer", + "description": "Maximum streaming 
timeout", + "required": false, + "scope": "global", + "default": 3600 + }, + { + "name": "maxAsyncParallelism", + "type": "integer", + "description": "Maximum async parallelism", + "required": false, + "scope": "global", + "default": 100 + }, + { + "name": "responseBufferSize", + "type": "integer", + "description": "Response buffer size for streaming", + "required": false, + "scope": "global", + "default": 10000 + }, + { + "name": "timestampWindowMs", + "type": "long", + "description": "Timestamp window in milliseconds", + "required": false, + "scope": "global", + "default": 1000 + } + ], + "tags": [ + "configuration", + "streaming", + "real-time", + "performance" + ] + }, + { + "id": "configuration.alerts", + "name": "Alert Configuration", + "category": "configuration", + "status": "stable", + "description": "Configure alert notifications triggered on execution completion. Supports Slack integration.", + "subcategory": "alerts", + "configuration": [ + { + "name": "triggerOn", + "type": "enum", + "description": "When to trigger alerts", + "required": false, + "scope": "global", + "default": "all", + "validValues": [ + "all", + "failure", + "success", + "generation_failure", + "validation_failure", + "generation_success", + "validation_success" + ], + "yamlPath": "config.alert.triggerOn" + }, + { + "name": "slackToken", + "type": "string", + "description": "Slack API token", + "required": false, + "scope": "global", + "yamlPath": "config.alert.slackToken" + }, + { + "name": "slackChannels", + "type": "array", + "description": "Slack channels to notify", + "required": false, + "scope": "global", + "yamlPath": "config.alert.slackChannels" + } + ], + "tags": [ + "configuration", + "alert", + "notification", + "slack" + ] + }, + { + "id": "configuration.validation", + "name": "Validation Runtime Configuration", + "category": "configuration", + "status": "stable", + "description": "Configure validation execution behavior.", + "subcategory": "validation_runtime", + 
"configuration": [ + { + "name": "numSampleErrorRecords", + "type": "integer", + "description": "Number of sample error records in reports", + "required": false, + "scope": "global", + "default": 5, + "yamlPath": "config.validation.numSampleErrorRecords" + }, + { + "name": "enableDeleteRecordTrackingFiles", + "type": "boolean", + "description": "Delete tracking files after validation", + "required": false, + "scope": "global", + "default": true, + "yamlPath": "config.validation.enableDeleteRecordTrackingFiles" + } + ], + "tags": [ + "configuration", + "validation", + "runtime" + ] + }, + { + "id": "configuration.runtime.spark", + "name": "Apache Spark Configuration", + "category": "configuration", + "status": "stable", + "description": "Configure the Apache Spark runtime for data processing. Set master URL, driver/executor memory, and Spark SQL settings.", + "subcategory": "runtime", + "configuration": [ + { + "name": "master", + "type": "string", + "description": "Spark master URL", + "required": false, + "scope": "global", + "default": "local[*]", + "yamlPath": "config.runtime.master" + }, + { + "name": "sparkConfig", + "type": "object", + "description": "Spark configuration key-value pairs", + "required": false, + "scope": "global", + "yamlPath": "config.runtime.sparkConfig" + } + ], + "examples": [ + { + "format": "yaml", + "code": "config:\n runtime:\n master: \"local[4]\"\n sparkConfig:\n \"spark.driver.memory\": \"4g\"\n \"spark.sql.shuffle.partitions\": \"10\"", + "title": "Spark configuration" + } + ], + "tags": [ + "configuration", + "runtime", + "spark", + "performance" + ] + }, + { + "id": "configuration.sink_options", + "name": "Global Sink Options", + "category": "configuration", + "status": "stable", + "description": "Global options for data output: random seed for reproducibility and locale for data generation.", + "subcategory": "sink", + "configuration": [ + { + "name": "seed", + "type": "string", + "description": "Random seed for reproducible 
generation", + "required": false, + "scope": "global", + "yamlPath": "sinkOptions.seed" + }, + { + "name": "locale", + "type": "string", + "description": "Locale for data generation (affects names, addresses)", + "required": false, + "scope": "global", + "yamlPath": "sinkOptions.locale" + } + ], + "examples": [ + { + "format": "yaml", + "code": "sinkOptions:\n seed: \"42\"\n locale: \"en-US\"", + "title": "Sink options" + } + ], + "tags": [ + "configuration", + "sink", + "seed", + "locale" + ] + }, + { + "id": "advanced.foreign_keys", + "name": "Foreign Key Relationships", + "category": "advanced", + "status": "stable", + "description": "Define foreign key relationships between data sources to maintain referential integrity. Supports composite keys, cardinality control, nullability, and multiple generation modes.", + "subcategory": "referential_integrity", + "configuration": [ + { + "name": "source", + "type": "object", + "description": "Source table containing primary key", + "required": true, + "scope": "plan", + "yamlPath": "foreignKeys[].source" + }, + { + "name": "generate", + "type": "array", + "description": "Target tables with foreign key references", + "required": false, + "scope": "plan", + "yamlPath": "foreignKeys[].generate" + }, + { + "name": "delete", + "type": "array", + "description": "Target tables for cleanup", + "required": false, + "scope": "plan", + "yamlPath": "foreignKeys[].delete" + } + ], + "examples": [ + { + "format": "yaml", + "code": "foreignKeys:\n - source:\n dataSource: postgres_db\n step: customers\n fields: [\"customer_id\"]\n generate:\n - dataSource: postgres_db\n step: orders\n fields: [\"customer_id\"]\n cardinality:\n min: 1\n max: 10\n distribution: \"uniform\"", + "title": "Foreign key with cardinality" + } + ], + "tags": [ + "advanced", + "foreign-key", + "referential-integrity", + "relationship" + ] + }, + { + "id": "advanced.foreign_key_cardinality", + "name": "Foreign Key Cardinality Control", + "category": "advanced", + 
"status": "stable", + "description": "Control the cardinality of foreign key relationships. Set min/max records per parent, ratio multipliers, and distribution patterns.", + "subcategory": "referential_integrity", + "configuration": [ + { + "name": "min", + "type": "integer", + "description": "Minimum records per parent key", + "required": false, + "scope": "plan", + "yamlPath": "foreignKeys[].generate[].cardinality.min" + }, + { + "name": "max", + "type": "integer", + "description": "Maximum records per parent key", + "required": false, + "scope": "plan", + "yamlPath": "foreignKeys[].generate[].cardinality.max" + }, + { + "name": "ratio", + "type": "double", + "description": "Ratio multiplier (e.g., 10.0 = 10x parent records)", + "required": false, + "scope": "plan", + "yamlPath": "foreignKeys[].generate[].cardinality.ratio" + }, + { + "name": "distribution", + "type": "enum", + "description": "Cardinality distribution", + "required": false, + "scope": "plan", + "default": "uniform", + "validValues": [ + "uniform", + "normal", + "zipf", + "power" + ], + "yamlPath": "foreignKeys[].generate[].cardinality.distribution" + } + ], + "tags": [ + "advanced", + "cardinality", + "distribution", + "foreign-key" + ] + }, + { + "id": "advanced.foreign_key_nullability", + "name": "Foreign Key Nullability", + "category": "advanced", + "status": "stable", + "description": "Control null value injection in foreign key fields. 
Configure percentage of nulls and distribution strategy (random, head, tail).", + "subcategory": "referential_integrity", + "configuration": [ + { + "name": "nullPercentage", + "type": "double", + "description": "Percentage of null values (0-1)", + "required": false, + "scope": "plan", + "range": { + "min": 0, + "max": 1 + }, + "yamlPath": "foreignKeys[].generate[].nullability.nullPercentage" + }, + { + "name": "strategy", + "type": "enum", + "description": "Null distribution strategy", + "required": false, + "scope": "plan", + "default": "random", + "validValues": [ + "random", + "leading", + "trailing" + ], + "yamlPath": "foreignKeys[].generate[].nullability.strategy" + } + ], + "tags": [ + "advanced", + "nullability", + "foreign-key", + "null" + ] + }, + { + "id": "advanced.foreign_key_generation_modes", + "name": "Foreign Key Generation Modes", + "category": "advanced", + "status": "stable", + "description": "Control how foreign key values are generated. \"all-exist\" ensures all records have valid FKs, \"all-combinations\" generates all possible combinations, \"partial\" creates a mix of valid and invalid references.", + "subcategory": "referential_integrity", + "configuration": [ + { + "name": "generationMode", + "type": "enum", + "description": "FK generation strategy", + "required": false, + "scope": "plan", + "default": "all-exist", + "validValues": [ + "all-exist", + "all-combinations", + "partial" + ] + } + ], + "tags": [ + "advanced", + "foreign-key", + "generation-mode" + ], + "useCases": [ + "all-exist: Standard referential integrity testing", + "all-combinations: Comprehensive join testing with all possible combinations", + "partial: Testing handling of orphan records and broken references" + ] + }, + { + "id": "advanced.count", + "name": "Record Count Configuration", + "category": "advanced", + "status": "stable", + "description": "Configure how many records to generate per step. 
Supports fixed count, per-field distribution, and streaming rate-based generation.", + "subcategory": "count", + "configuration": [ + { + "name": "records", + "type": "integer", + "description": "Total records to generate", + "required": false, + "scope": "step", + "default": 1000, + "yamlPath": "dataSources[].steps[].count.records" + }, + { + "name": "perField", + "type": "object", + "description": "Generate records per unique field value", + "required": false, + "scope": "step", + "yamlPath": "dataSources[].steps[].count.perField" + } + ], + "examples": [ + { + "format": "yaml", + "code": "count:\n records: 5000", + "title": "Fixed count" + }, + { + "format": "yaml", + "code": "count:\n records: 100\n perField:\n fieldNames: [\"account_id\"]\n options:\n min: 1\n max: 5", + "title": "Per-field count distribution" + } + ], + "tags": [ + "advanced", + "count", + "records", + "distribution" + ] + }, + { + "id": "advanced.streaming_load_patterns", + "name": "Streaming Load Patterns", + "category": "advanced", + "status": "stable", + "description": "Define time-based data generation patterns for streaming scenarios. 
Supports ramp, spike, sine, and custom step patterns.", + "subcategory": "streaming", + "configuration": [ + { + "name": "duration", + "type": "string", + "description": "Streaming duration (e.g., 10m, 1h)", + "required": false, + "scope": "step", + "yamlPath": "dataSources[].steps[].count.duration" + }, + { + "name": "rate", + "type": "integer", + "description": "Records per time unit", + "required": false, + "scope": "step", + "yamlPath": "dataSources[].steps[].count.rate" + }, + { + "name": "rateUnit", + "type": "enum", + "description": "Time unit for rate", + "required": false, + "scope": "step", + "validValues": [ + "second", + "minute", + "hour" + ], + "yamlPath": "dataSources[].steps[].count.rateUnit" + }, + { + "name": "pattern.type", + "type": "enum", + "description": "Load pattern type", + "required": false, + "scope": "step", + "validValues": [ + "ramp", + "spike", + "sine", + "steps" + ], + "yamlPath": "dataSources[].steps[].count.pattern.type" + }, + { + "name": "pattern.startRate", + "type": "integer", + "description": "Starting rate for ramp pattern", + "required": false, + "scope": "step" + }, + { + "name": "pattern.endRate", + "type": "integer", + "description": "Ending rate for ramp pattern", + "required": false, + "scope": "step" + }, + { + "name": "pattern.baseRate", + "type": "integer", + "description": "Base rate for spike pattern", + "required": false, + "scope": "step" + }, + { + "name": "pattern.spikeRate", + "type": "integer", + "description": "Spike rate", + "required": false, + "scope": "step" + }, + { + "name": "pattern.amplitude", + "type": "integer", + "description": "Amplitude for sine pattern", + "required": false, + "scope": "step" + }, + { + "name": "pattern.frequency", + "type": "double", + "description": "Frequency for sine pattern", + "required": false, + "scope": "step" + }, + { + "name": "pattern.steps", + "type": "array", + "description": "Custom step definitions with rate and duration", + "required": false, + "scope": 
"step" + } + ], + "examples": [ + { + "format": "yaml", + "code": "count:\n duration: \"1m\"\n rate: 100\n rateUnit: \"second\"\n pattern:\n type: \"ramp\"\n startRate: 10\n endRate: 200", + "title": "Ramp load pattern" + } + ], + "tags": [ + "advanced", + "streaming", + "load-pattern", + "rate" + ] + }, + { + "id": "advanced.transformation", + "name": "Post-Generation Transformation", + "category": "advanced", + "status": "stable", + "description": "Apply custom Java/Scala transformations to generated data before writing to output. Supports whole-file and row-by-row modes.", + "subcategory": "transformation", + "configuration": [ + { + "name": "className", + "type": "string", + "description": "Fully qualified transformation class name", + "required": true, + "scope": "step", + "yamlPath": "dataSources[].steps[].transformation.className" + }, + { + "name": "methodName", + "type": "string", + "description": "Method to call", + "required": false, + "scope": "step", + "default": "transform" + }, + { + "name": "mode", + "type": "enum", + "description": "Transformation mode", + "required": false, + "scope": "step", + "validValues": [ + "whole-file", + "row-by-row" + ] + }, + { + "name": "outputPath", + "type": "string", + "description": "Output directory", + "required": false, + "scope": "step" + }, + { + "name": "deleteOriginal", + "type": "boolean", + "description": "Delete input after transformation", + "required": false, + "scope": "step" + }, + { + "name": "enabled", + "type": "boolean", + "description": "Enable/disable transformation", + "required": false, + "scope": "step", + "default": true + } + ], + "tags": [ + "advanced", + "transformation", + "custom", + "plugin" + ] + }, + { + "id": "advanced.step_options", + "name": "Step Field Filtering", + "category": "advanced", + "status": "stable", + "description": "Include or exclude fields from metadata-driven generation using exact names or patterns.", + "subcategory": "step_options", + "configuration": [ + { + 
"name": "includeFields", + "type": "array", + "description": "List of field names to include", + "required": false, + "scope": "step", + "scalaConstant": "INCLUDE_FIELDS" + }, + { + "name": "excludeFields", + "type": "array", + "description": "List of field names to exclude", + "required": false, + "scope": "step", + "scalaConstant": "EXCLUDE_FIELDS" + }, + { + "name": "includeFieldPatterns", + "type": "array", + "description": "Regex patterns for fields to include", + "required": false, + "scope": "step", + "scalaConstant": "INCLUDE_FIELD_PATTERNS" + }, + { + "name": "excludeFieldPatterns", + "type": "array", + "description": "Regex patterns for fields to exclude", + "required": false, + "scope": "step", + "scalaConstant": "EXCLUDE_FIELD_PATTERNS" + }, + { + "name": "allCombinations", + "type": "boolean", + "description": "Generate all field value combinations", + "required": false, + "scope": "step", + "scalaConstant": "ALL_COMBINATIONS" + } + ], + "tags": [ + "advanced", + "step", + "filtering", + "metadata" + ] + }, + { + "id": "metadata.source.marquez", + "name": "Marquez Integration", + "category": "metadata", + "status": "stable", + "description": "Apache Marquez open-source metadata service with OpenLineage support.", + "subcategory": "sources", + "tags": [ + "metadata", + "integration", + "marquez" + ] + }, + { + "id": "metadata.source.open_metadata", + "name": "OpenMetadata Integration", + "category": "metadata", + "status": "stable", + "description": "OpenMetadata platform for metadata discovery. 
Supports multiple auth types (basic, Azure, Google, Okta, Auth0, AWS Cognito).", + "subcategory": "sources", + "tags": [ + "metadata", + "integration", + "open-metadata" + ] + }, + { + "id": "metadata.source.open_api", + "name": "OpenAPI/Swagger Integration", + "category": "metadata", + "status": "stable", + "description": "Generate data from OpenAPI/Swagger specifications.", + "subcategory": "sources", + "tags": [ + "metadata", + "integration", + "open-api" + ] + }, + { + "id": "metadata.source.great_expectations", + "name": "Great Expectations Integration", + "category": "metadata", + "status": "stable", + "description": "Import data quality expectations from Great Expectations suites.", + "subcategory": "sources", + "tags": [ + "metadata", + "integration", + "great-expectations" + ] + }, + { + "id": "metadata.source.open_data_contract_standard", + "name": "Open Data Contract Standard Integration", + "category": "metadata", + "status": "stable", + "description": "Import schemas from ODCS format.", + "subcategory": "sources", + "tags": [ + "metadata", + "integration", + "open-data-contract-standard" + ] + }, + { + "id": "metadata.source.data_contract_cli", + "name": "Data Contract CLI Integration", + "category": "metadata", + "status": "stable", + "description": "Import schemas from Data Contract CLI format.", + "subcategory": "sources", + "tags": [ + "metadata", + "integration", + "data-contract-cli" + ] + }, + { + "id": "metadata.source.amundsen", + "name": "Amundsen Integration", + "category": "metadata", + "status": "stable", + "description": "Import metadata from Amundsen data catalog.", + "subcategory": "sources", + "tags": [ + "metadata", + "integration", + "amundsen" + ] + }, + { + "id": "metadata.source.datahub", + "name": "DataHub Integration", + "category": "metadata", + "status": "stable", + "description": "Import metadata from DataHub data catalog.", + "subcategory": "sources", + "tags": [ + "metadata", + "integration", + "datahub" + ] + }, + { + 
"id": "metadata.source.confluent_schema_registry", + "name": "Confluent Schema Registry Integration", + "category": "metadata", + "status": "stable", + "description": "Import schemas from Confluent Schema Registry (Avro, Protobuf, JSON Schema).", + "subcategory": "sources", + "tags": [ + "metadata", + "integration", + "confluent-schema-registry" + ] + }, + { + "id": "metadata.source.json_schema", + "name": "JSON Schema Integration", + "category": "metadata", + "status": "stable", + "description": "Generate data from JSON Schema definitions.", + "subcategory": "sources", + "tags": [ + "metadata", + "integration", + "json-schema" + ] + }, + { + "id": "advanced.reference_mode", + "name": "Reference Mode", + "category": "advanced", + "status": "stable", + "description": "Load existing data as reference for foreign key relationships instead of generating new data. Useful when you need realistic FK values from existing datasets.", + "subcategory": "reference", + "configuration": [ + { + "name": "enableReferenceMode", + "type": "boolean", + "description": "Enable reference mode for this data source", + "required": false, + "scope": "datasource", + "default": false + }, + { + "name": "enableDataGeneration", + "type": "boolean", + "description": "Disable generation (use with reference mode)", + "required": false, + "scope": "datasource", + "default": true + } + ], + "tags": [ + "advanced", + "reference", + "existing-data", + "foreign-key" + ] + }, + { + "id": "advanced.interfaces", + "name": "Configuration Interfaces", + "category": "advanced", + "status": "stable", + "description": "Data Caterer supports multiple configuration interfaces: Java API, Scala API, YAML configuration, and Web UI.", + "subcategory": "interfaces", + "tags": [ + "advanced", + "interface", + "api", + "yaml", + "ui" + ], + "useCases": [ + "Java API: Programmatic configuration from Java applications", + "Scala API: Programmatic configuration with Scala builders", + "YAML: Declarative configuration for 
CI/CD and automation", + "Web UI: Visual configuration and execution management" + ] + }, + { + "id": "advanced.env_substitution", + "name": "Environment Variable Substitution", + "category": "advanced", + "status": "stable", + "description": "Use ${VAR_NAME} syntax in YAML configuration to substitute environment variables at runtime. Supports default values with ${VAR:-default}.", + "subcategory": "configuration", + "examples": [ + { + "format": "yaml", + "code": "options:\n password: \"${DB_PASSWORD}\"\n url: \"${KAFKA_BROKERS:-localhost:9092}\"", + "title": "Environment variable substitution" + } + ], + "tags": [ + "advanced", + "environment", + "variable", + "secrets" + ] + }, + { + "id": "ui.connection_management", + "name": "Connection Management", + "category": "ui_api", + "status": "stable", + "description": "Create, edit, test, and manage data source connections through the web UI.", + "subcategory": "web_ui", + "tags": [ + "ui", + "web", + "connection-management" + ], + "sourceFiles": [ + { + "path": "app/src/main/scala/io/github/datacatering/datacaterer/core/ui/", + "role": "primary" + } + ] + }, + { + "id": "ui.plan_creation", + "name": "Interactive Plan Creation", + "category": "ui_api", + "status": "stable", + "description": "Build data generation plans interactively with visual field configuration.", + "subcategory": "web_ui", + "tags": [ + "ui", + "web", + "plan-creation" + ], + "sourceFiles": [ + { + "path": "app/src/main/scala/io/github/datacatering/datacaterer/core/ui/", + "role": "primary" + } + ] + }, + { + "id": "ui.execution_history", + "name": "Execution History", + "category": "ui_api", + "status": "stable", + "description": "View past execution runs with status, timing, and record counts.", + "subcategory": "web_ui", + "tags": [ + "ui", + "web", + "execution-history" + ], + "sourceFiles": [ + { + "path": "app/src/main/scala/io/github/datacatering/datacaterer/core/ui/", + "role": "primary" + } + ] + }, + { + "id": "ui.results_viewing", + 
"name": "Real-time Results", + "category": "ui_api", + "status": "stable", + "description": "View generation and validation results in real-time during execution.", + "subcategory": "web_ui", + "tags": [ + "ui", + "web", + "results-viewing" + ], + "sourceFiles": [ + { + "path": "app/src/main/scala/io/github/datacatering/datacaterer/core/ui/", + "role": "primary" + } + ] + }, + { + "id": "ui.sample_data", + "name": "Sample Data Generation", + "category": "ui_api", + "status": "stable", + "description": "Preview generated sample data before running full generation.", + "subcategory": "web_ui", + "tags": [ + "ui", + "web", + "sample-data" + ], + "sourceFiles": [ + { + "path": "app/src/main/scala/io/github/datacatering/datacaterer/core/ui/", + "role": "primary" + } + ] + }, + { + "id": "ui.report_generation", + "name": "Report Generation", + "category": "ui_api", + "status": "stable", + "description": "Generate detailed HTML reports with generation statistics and validation results.", + "subcategory": "web_ui", + "tags": [ + "ui", + "web", + "report-generation" + ], + "sourceFiles": [ + { + "path": "app/src/main/scala/io/github/datacatering/datacaterer/core/ui/", + "role": "primary" + } + ] + } + ] +} \ No newline at end of file diff --git a/misc/feature-catalog/schema/feature-metadata-schema.json b/misc/feature-catalog/schema/feature-metadata-schema.json new file mode 100644 index 00000000..9bc1dc67 --- /dev/null +++ b/misc/feature-catalog/schema/feature-metadata-schema.json @@ -0,0 +1,175 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "$id": "https://github.com/data-catering/data-caterer/feature-catalog/schema", + "title": "Feature Catalog Schema", + "description": "Generic schema for documenting software features. 
Designed to be reusable across projects.", + "type": "object", + "properties": { + "project": { + "type": "object", + "description": "Project metadata", + "properties": { + "name": { "type": "string" }, + "version": { "type": "string" }, + "repository": { "type": "string" }, + "lastUpdated": { "type": "string", "format": "date-time" } + }, + "required": ["name", "version"] + }, + "categories": { + "type": "array", + "items": { "$ref": "#/$defs/category" } + }, + "features": { + "type": "array", + "items": { "$ref": "#/$defs/feature" } + } + }, + "required": ["project", "features"], + "$defs": { + "category": { + "type": "object", + "properties": { + "id": { "type": "string", "description": "Unique category identifier" }, + "name": { "type": "string", "description": "Human-readable name" }, + "description": { "type": "string" }, + "featureCount": { "type": "integer" } + }, + "required": ["id", "name"] + }, + "feature": { + "type": "object", + "properties": { + "id": { + "type": "string", + "description": "Unique feature identifier using dot notation (e.g., 'generation.field.regex')", + "pattern": "^[a-z][a-z0-9_]*(\\.[a-z][a-z0-9_]*)*$" + }, + "name": { + "type": "string", + "description": "Human-readable feature name" + }, + "category": { + "type": "string", + "description": "Category this feature belongs to" + }, + "subcategory": { + "type": "string", + "description": "Subcategory for finer grouping" + }, + "status": { + "type": "string", + "enum": ["stable", "experimental", "deprecated", "planned"], + "description": "Implementation status" + }, + "description": { + "type": "string", + "description": "Detailed description of the feature" + }, + "useCases": { + "type": "array", + "items": { "type": "string" }, + "description": "Common use cases for this feature" + }, + "configuration": { + "type": "array", + "items": { "$ref": "#/$defs/configOption" }, + "description": "Configuration options that control this feature" + }, + "sourceFiles": { + "type": "array", + 
"items": { "$ref": "#/$defs/sourceFile" }, + "description": "Source code locations implementing this feature" + }, + "dependencies": { + "type": "object", + "properties": { + "features": { + "type": "array", + "items": { "type": "string" }, + "description": "Other feature IDs this feature depends on" + }, + "libraries": { + "type": "array", + "items": { "type": "string" }, + "description": "External library dependencies" + } + } + }, + "examples": { + "type": "array", + "items": { "$ref": "#/$defs/example" }, + "description": "Usage examples" + }, + "relatedFeatures": { + "type": "array", + "items": { "type": "string" }, + "description": "IDs of related features" + }, + "tags": { + "type": "array", + "items": { "type": "string" }, + "description": "Searchable tags" + }, + "limitations": { + "type": "array", + "items": { "type": "string" }, + "description": "Known limitations" + }, + "performanceNotes": { + "type": "array", + "items": { "type": "string" }, + "description": "Performance considerations" + } + }, + "required": ["id", "name", "category", "status", "description"] + }, + "configOption": { + "type": "object", + "properties": { + "name": { "type": "string", "description": "Option name as used in configuration" }, + "type": { "type": "string", "enum": ["boolean", "string", "integer", "long", "double", "enum", "array", "object", "any"] }, + "required": { "type": "boolean", "default": false }, + "default": { "description": "Default value" }, + "description": { "type": "string" }, + "validValues": { "type": "array", "items": {}, "description": "Valid values for enum types" }, + "range": { + "type": "object", + "properties": { + "min": { "type": "number" }, + "max": { "type": "number" } + } + }, + "scope": { + "type": "string", + "enum": ["field", "step", "datasource", "plan", "global"], + "description": "Configuration scope level" + }, + "yamlPath": { "type": "string", "description": "Path in YAML configuration" }, + "envVar": { "type": "string", "description": 
"Environment variable name" }, + "scalaConstant": { "type": "string", "description": "Scala constant name in source code" } + }, + "required": ["name", "type", "description"] + }, + "sourceFile": { + "type": "object", + "properties": { + "path": { "type": "string" }, + "role": { "type": "string", "enum": ["primary", "supporting", "test", "example"] }, + "classes": { "type": "array", "items": { "type": "string" } }, + "methods": { "type": "array", "items": { "type": "string" } } + }, + "required": ["path"] + }, + "example": { + "type": "object", + "properties": { + "format": { "type": "string", "enum": ["yaml", "scala", "java", "json", "shell"] }, + "title": { "type": "string" }, + "code": { "type": "string" }, + "description": { "type": "string" } + }, + "required": ["format", "code"] + } + } +} diff --git a/misc/feature-catalog/scripts/extract_features.py b/misc/feature-catalog/scripts/extract_features.py new file mode 100644 index 00000000..60e7870a --- /dev/null +++ b/misc/feature-catalog/scripts/extract_features.py @@ -0,0 +1,1739 @@ +#!/usr/bin/env python3 +""" +Feature extraction script for Data Caterer. + +Parses source code (Constants.scala, ConfigModels.scala), JSON schema, +and YAML examples to produce a comprehensive features.json catalog. 
+ +Usage: + python scripts/extract_features.py + +Output: + ../features.json +""" + +import sys +from pathlib import Path + +# Add scripts dir to path +sys.path.insert(0, str(Path(__file__).parent)) + +from utils import ( + PROJECT_ROOT, FEATURE_CATALOG_DIR, read_file, save_json, + parse_scala_lazy_vals, parse_scala_case_class, load_json, + make_feature, make_config_option, make_example, make_feature_id, +) + + +# Source file paths +CONSTANTS_PATH = PROJECT_ROOT / "api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala" +CONFIG_MODELS_PATH = PROJECT_ROOT / "api/src/main/scala/io/github/datacatering/datacaterer/api/model/ConfigModels.scala" +UNIFIED_SCHEMA_PATH = PROJECT_ROOT / "misc/schema/unified-config-schema.json" +EXAMPLES_DIR = PROJECT_ROOT / "misc/schema/examples" + + +def extract_data_source_features(vals: list[dict]) -> list[dict]: + """Extract data source connector features from Constants.scala.""" + features = [] + + connectors = { + # Databases + 'postgres': { + 'name': 'PostgreSQL Connector', + 'subcategory': 'databases', + 'description': 'Connect to PostgreSQL databases for reading and writing data. 
Supports table-level configuration, custom queries, and JDBC options.', + 'tags': ['database', 'jdbc', 'relational', 'sql'], + 'config': [ + make_config_option('url', 'string', 'JDBC connection URL', required=True, scope='datasource', yaml_path='dataSources[].connection.options.url'), + make_config_option('user', 'string', 'Database username', scope='datasource', yaml_path='dataSources[].connection.options.user'), + make_config_option('password', 'string', 'Database password', scope='datasource', yaml_path='dataSources[].connection.options.password'), + make_config_option('driver', 'string', 'JDBC driver class', default='org.postgresql.Driver', scope='datasource'), + make_config_option('dbtable', 'string', 'Target table (schema.table)', scope='step', yaml_path='dataSources[].steps[].options.dbtable'), + make_config_option('query', 'string', 'Custom SQL query for reading', scope='step'), + ], + 'examples': [ + make_example('yaml', '''dataSources: + - name: my_postgres + connection: + type: postgres + options: + url: "jdbc:postgresql://localhost:5432/mydb" + user: "postgres" + password: "${POSTGRES_PASSWORD}" + steps: + - name: customers + options: + dbtable: "public.customers" + count: + records: 1000''', 'PostgreSQL data generation'), + ], + }, + 'mysql': { + 'name': 'MySQL Connector', + 'subcategory': 'databases', + 'description': 'Connect to MySQL databases for reading and writing data. 
Supports table-level configuration and JDBC options.', + 'tags': ['database', 'jdbc', 'relational', 'sql'], + 'config': [ + make_config_option('url', 'string', 'JDBC connection URL', required=True, scope='datasource'), + make_config_option('user', 'string', 'Database username', scope='datasource'), + make_config_option('password', 'string', 'Database password', scope='datasource'), + make_config_option('driver', 'string', 'JDBC driver class', default='com.mysql.cj.jdbc.Driver', scope='datasource'), + make_config_option('dbtable', 'string', 'Target table', scope='step'), + ], + 'examples': [ + make_example('yaml', '''connection: + type: mysql + options: + url: "jdbc:mysql://localhost:3306/mydb" + user: "root" + password: "${MYSQL_PASSWORD}"''', 'MySQL connection'), + ], + }, + 'cassandra': { + 'name': 'Cassandra Connector', + 'subcategory': 'databases', + 'description': 'Connect to Apache Cassandra for reading and writing data. Supports keyspace/table configuration, primary key and clustering positions.', + 'tags': ['database', 'nosql', 'wide-column'], + 'config': [ + make_config_option('url', 'string', 'Cassandra contact point URL', required=True, scope='datasource'), + make_config_option('user', 'string', 'Cassandra username', scope='datasource'), + make_config_option('password', 'string', 'Cassandra password', scope='datasource'), + make_config_option('keyspace', 'string', 'Cassandra keyspace', required=True, scope='step'), + make_config_option('table', 'string', 'Cassandra table', required=True, scope='step'), + ], + 'examples': [ + make_example('yaml', '''connection: + type: cassandra + options: + url: "localhost:9042" + user: "cassandra" + password: "cassandra"''', 'Cassandra connection'), + ], + }, + 'bigquery': { + 'name': 'BigQuery Connector', + 'subcategory': 'databases', + 'description': 'Connect to Google BigQuery for reading and writing data. 
Supports direct and indirect write methods.', + 'tags': ['database', 'cloud', 'google', 'data-warehouse'], + 'config': [ + make_config_option('table', 'string', 'BigQuery table (project.dataset.table)', required=True, scope='step'), + make_config_option('credentialsFile', 'string', 'Path to GCP credentials JSON', scope='datasource'), + make_config_option('writeMethod', 'string', 'Write method', default='indirect', scope='datasource', valid_values=['direct', 'indirect']), + make_config_option('temporaryGcsBucket', 'string', 'GCS bucket for indirect writes', scope='datasource'), + make_config_option('queryJobPriority', 'string', 'Query job priority', default='batch', scope='datasource'), + ], + 'examples': [], + }, + # File formats + 'csv': { + 'name': 'CSV File Connector', + 'subcategory': 'files', + 'description': 'Read and write CSV files. Supports headers, delimiters, and other CSV-specific options.', + 'tags': ['file', 'csv', 'delimited', 'text'], + 'config': [ + make_config_option('path', 'string', 'File system path for CSV files', required=True, scope='datasource', yaml_path='dataSources[].connection.options.path'), + ], + 'examples': [ + make_example('yaml', '''connection: + type: csv + options: + path: "/tmp/data/csv-output"''', 'CSV file output'), + ], + }, + 'json': { + 'name': 'JSON File Connector', + 'subcategory': 'files', + 'description': 'Read and write JSON files. 
Supports nested structures, arrays, and unwrapping top-level arrays.', + 'tags': ['file', 'json', 'structured'], + 'config': [ + make_config_option('path', 'string', 'File system path for JSON files', required=True, scope='datasource'), + make_config_option('unwrapTopLevelArray', 'boolean', 'Output JSON as root-level array instead of object', default=False, scope='step'), + ], + 'examples': [ + make_example('yaml', '''connection: + type: json + options: + path: "/tmp/data/json-output"''', 'JSON file output'), + ], + }, + 'parquet': { + 'name': 'Parquet File Connector', + 'subcategory': 'files', + 'description': 'Read and write Apache Parquet columnar files. Efficient for large datasets.', + 'tags': ['file', 'parquet', 'columnar', 'binary'], + 'config': [ + make_config_option('path', 'string', 'File system path for Parquet files', required=True, scope='datasource'), + ], + 'examples': [], + }, + 'orc': { + 'name': 'ORC File Connector', + 'subcategory': 'files', + 'description': 'Read and write Apache ORC columnar files.', + 'tags': ['file', 'orc', 'columnar', 'binary'], + 'config': [ + make_config_option('path', 'string', 'File system path for ORC files', required=True, scope='datasource'), + ], + 'examples': [], + }, + 'delta': { + 'name': 'Delta Lake Connector', + 'subcategory': 'files', + 'description': 'Read and write Delta Lake tables. Supports ACID transactions, time travel, and schema evolution.', + 'tags': ['file', 'delta', 'lakehouse', 'acid'], + 'config': [ + make_config_option('path', 'string', 'File system path for Delta tables', required=True, scope='datasource'), + ], + 'examples': [], + }, + 'iceberg': { + 'name': 'Apache Iceberg Connector', + 'subcategory': 'files', + 'description': 'Read and write Apache Iceberg tables. 
Supports multiple catalog types (Hadoop, Hive, REST, Glue, JDBC, Nessie).', + 'tags': ['file', 'iceberg', 'lakehouse', 'catalog'], + 'config': [ + make_config_option('path', 'string', 'Table path', required=True, scope='datasource'), + make_config_option('catalogType', 'string', 'Iceberg catalog type', default='hadoop', scope='datasource', + valid_values=['hadoop', 'hive', 'rest', 'glue', 'jdbc', 'nessie']), + make_config_option('catalogUri', 'string', 'Catalog URI (for hive/rest/nessie)', scope='datasource'), + make_config_option('catalogDefaultNamespace', 'string', 'Default namespace', scope='datasource'), + ], + 'examples': [], + }, + 'hudi': { + 'name': 'Apache Hudi Connector', + 'subcategory': 'files', + 'description': 'Read and write Apache Hudi tables.', + 'tags': ['file', 'hudi', 'lakehouse'], + 'config': [ + make_config_option('path', 'string', 'Table path', required=True, scope='datasource'), + make_config_option('hoodie.table.name', 'string', 'Hudi table name', required=True, scope='step'), + ], + 'examples': [], + }, + 'xml': { + 'name': 'XML File Connector', + 'subcategory': 'files', + 'description': 'Read and write XML files.', + 'tags': ['file', 'xml', 'structured'], + 'config': [ + make_config_option('path', 'string', 'File system path for XML files', required=True, scope='datasource'), + ], + 'examples': [], + }, + # Messaging + 'kafka': { + 'name': 'Apache Kafka Connector', + 'subcategory': 'messaging', + 'description': 'Connect to Apache Kafka for producing and consuming messages. 
Supports topics, partitions, headers, key/value serialization, and streaming patterns.', + 'tags': ['messaging', 'kafka', 'streaming', 'event'], + 'config': [ + make_config_option('url', 'string', 'Kafka bootstrap servers', required=True, scope='datasource', yaml_path='dataSources[].connection.options.url'), + make_config_option('topic', 'string', 'Kafka topic name', required=True, scope='step'), + make_config_option('schemaLocation', 'string', 'Schema registry URL or file path', scope='datasource'), + ], + 'examples': [ + make_example('yaml', '''dataSources: + - name: my_kafka + connection: + type: kafka + options: + url: "localhost:9092" + steps: + - name: orders_topic + options: + topic: "orders" + count: + duration: "1m" + rate: 100 + rateUnit: "second"''', 'Kafka streaming'), + ], + }, + 'solace': { + 'name': 'Solace JMS Connector', + 'subcategory': 'messaging', + 'description': 'Connect to Solace PubSub+ message broker via JMS. Supports queues and topics.', + 'tags': ['messaging', 'jms', 'solace'], + 'config': [ + make_config_option('url', 'string', 'Solace broker URL', required=True, scope='datasource'), + make_config_option('user', 'string', 'Username', scope='datasource'), + make_config_option('password', 'string', 'Password', scope='datasource'), + make_config_option('vpnName', 'string', 'VPN name', default='default', scope='datasource'), + make_config_option('connectionFactory', 'string', 'JNDI connection factory', default='/jms/cf/default', scope='datasource'), + make_config_option('initialContextFactory', 'string', 'JNDI context factory', scope='datasource'), + make_config_option('destinationName', 'string', 'Queue/topic destination', required=True, scope='step'), + ], + 'examples': [], + }, + 'rabbitmq': { + 'name': 'RabbitMQ Connector', + 'subcategory': 'messaging', + 'description': 'Connect to RabbitMQ message broker via JMS. 
Supports queues.', + 'tags': ['messaging', 'rabbitmq', 'jms', 'amqp'], + 'config': [ + make_config_option('url', 'string', 'RabbitMQ URL', required=True, scope='datasource'), + make_config_option('user', 'string', 'Username', default='guest', scope='datasource'), + make_config_option('password', 'string', 'Password', default='guest', scope='datasource'), + make_config_option('virtualHost', 'string', 'Virtual host', default='/', scope='datasource'), + make_config_option('destinationName', 'string', 'Queue name', required=True, scope='step'), + ], + 'examples': [], + }, + # HTTP + 'http': { + 'name': 'HTTP/REST API Connector', + 'subcategory': 'http', + 'description': 'Send generated data to HTTP/REST APIs. Supports custom methods, headers, URL path parameters, query parameters, and request bodies.', + 'tags': ['http', 'rest', 'api', 'web'], + 'config': [ + make_config_option('url', 'string', 'Base URL for HTTP requests', required=True, scope='datasource'), + ], + 'examples': [ + make_example('yaml', '''dataSources: + - name: my_api + connection: + type: http + options: + url: "http://localhost:8080" + steps: + - name: create_users + fields: + - name: httpUrl + type: struct + fields: + - name: url + static: "http://localhost:8080/api/users" + - name: method + static: "POST" + - name: httpBody + type: struct + fields: + - name: name + options: + expression: "#{Name.fullName}"''', 'HTTP API data generation'), + ], + }, + } + + for key, info in connectors.items(): + features.append(make_feature( + id=make_feature_id('connector', info['subcategory'], key), + name=info['name'], + category='connectors', + subcategory=info['subcategory'], + status='stable', + description=info['description'], + configuration=info['config'], + examples=info.get('examples', []), + tags=info.get('tags', []), + source_files=[ + {'path': 'api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala', 'role': 'supporting'}, + ], + )) + + return features + + +def 
extract_field_generation_features() -> list[dict]: + """Extract data generation features for field types and generators.""" + features = [] + + # Core generator types + generators = [ + { + 'id': 'generation.field.regex', + 'name': 'Regex Pattern Generation', + 'description': 'Generate string values matching a regular expression pattern. Supports SQL-based optimization for common patterns with automatic fallback to UDF for complex patterns (lookaheads, backreferences).', + 'config': [ + make_config_option('regex', 'string', 'Regular expression pattern to generate values from', required=True, scope='field', yaml_path='fields[].options.regex'), + ], + 'examples': [ + make_example('yaml', '''- name: account_id + options: + regex: "ACC[0-9]{8}"''', 'Simple regex pattern'), + make_example('yaml', '''- name: product_code + options: + regex: "[A-Z]{3}-[0-9]{4}"''', 'Alphanumeric pattern'), + make_example('scala', 'field.name("account_id").regex("ACC[0-9]{8}")', 'Scala API'), + ], + 'tags': ['generation', 'string', 'pattern', 'regex'], + 'related': ['configuration.flags.enable_fast_generation'], + 'performance': ['SQL-based optimization available via enableFastGeneration flag', 'Complex patterns (lookaheads, backreferences) automatically fall back to UDF'], + 'source_files': [ + {'path': 'app/src/main/scala/io/github/datacatering/datacaterer/core/generator/provider/regex/RegexPatternParser.scala', 'role': 'primary'}, + ], + }, + { + 'id': 'generation.field.expression', + 'name': 'DataFaker Expression', + 'description': 'Generate realistic fake data using DataFaker library expressions. 
Supports names, addresses, emails, phone numbers, and hundreds of other data types.', + 'config': [ + make_config_option('expression', 'string', 'DataFaker expression (e.g., #{Name.firstName})', required=True, scope='field', yaml_path='fields[].options.expression'), + ], + 'examples': [ + make_example('yaml', '''- name: full_name + options: + expression: "#{Name.fullName}"''', 'Full name generation'), + make_example('yaml', '''- name: email + options: + expression: "#{Internet.emailAddress}"''', 'Email generation'), + make_example('scala', 'field.name("email").expression("#{Internet.emailAddress}")', 'Scala API'), + ], + 'tags': ['generation', 'faker', 'realistic', 'expression'], + 'deps': {'libraries': ['net.datafaker:datafaker']}, + }, + { + 'id': 'generation.field.one_of', + 'name': 'One-Of Selection', + 'description': 'Generate values by randomly selecting from a predefined list of options. Useful for categorical data like statuses, types, and enums.', + 'config': [ + make_config_option('oneOf', 'array', 'List of values to randomly select from', required=True, scope='field', yaml_path='fields[].options.oneOf'), + ], + 'examples': [ + make_example('yaml', '''- name: status + options: + oneOf: ["active", "inactive", "pending"]''', 'Enum field'), + make_example('scala', 'field.name("status").oneOf("active", "inactive", "pending")', 'Scala API'), + ], + 'tags': ['generation', 'enum', 'categorical', 'selection'], + }, + { + 'id': 'generation.field.sql', + 'name': 'SQL Expression', + 'description': 'Generate field values using Spark SQL expressions. 
Supports referencing other fields, date functions, string operations, aggregations, and conditional logic.', + 'config': [ + make_config_option('sql', 'string', 'Spark SQL expression for computed value', required=True, scope='field', yaml_path='fields[].options.sql'), + ], + 'examples': [ + make_example('yaml', '''- name: year + type: integer + options: + sql: "YEAR(created_at)"''', 'Extract year from date field'), + make_example('yaml', '''- name: full_name + type: string + options: + sql: "CONCAT(first_name, \' \', last_name)"''', 'Concatenate fields'), + make_example('yaml', '''- name: total_amount + type: double + options: + sql: "quantity * unit_price"''', 'Computed field'), + ], + 'tags': ['generation', 'sql', 'computed', 'derived'], + }, + { + 'id': 'generation.field.static', + 'name': 'Static Value', + 'description': 'Set a fixed static value for all generated records. Useful for constant fields like API endpoints, methods, or content types.', + 'config': [ + make_config_option('static', 'string', 'Fixed value for all records', required=True, scope='field', yaml_path='fields[].static'), + ], + 'examples': [ + make_example('yaml', '''- name: method + static: "POST"''', 'Static HTTP method'), + ], + 'tags': ['generation', 'static', 'constant'], + }, + { + 'id': 'generation.field.uuid', + 'name': 'UUID Generation', + 'description': 'Generate universally unique identifiers (UUID v4).', + 'config': [ + make_config_option('uuidPattern', 'boolean', 'Enable UUID generation', default=False, scope='field', yaml_path='fields[].options.uuidPattern'), + ], + 'examples': [ + make_example('yaml', '''- name: id + options: + uuidPattern: true''', 'UUID field'), + ], + 'tags': ['generation', 'uuid', 'identifier', 'unique'], + }, + { + 'id': 'generation.field.sequence', + 'name': 'Sequential Value Generation', + 'description': 'Generate sequential values with optional prefix and padding. 
Useful for IDs, batch numbers, and sequential identifiers.', + 'config': [ + make_config_option('sequence', 'object', 'Sequential value configuration with prefix and padding', required=True, scope='field', yaml_path='fields[].options.sequence'), + ], + 'examples': [ + make_example('yaml', '''- name: order_id + options: + sequence: + start: 1000 + step: 1 + prefix: "ORD-" + padding: 8''', 'Sequential order IDs'), + ], + 'tags': ['generation', 'sequence', 'sequential', 'incremental'], + }, + { + 'id': 'generation.field.conditional_value', + 'name': 'Conditional Value Generation', + 'description': 'Generate values using CASE WHEN logic based on other field values. Enables dependent field generation.', + 'config': [ + make_config_option('conditionalValue', 'object', 'CASE WHEN conditions and result values', required=True, scope='field', yaml_path='fields[].options.conditionalValue'), + ], + 'examples': [ + make_example('yaml', '''- name: discount + type: double + options: + conditionalValue: + conditions: + - expr: "customer_type = \'premium\'" + value: 0.2 + - expr: "customer_type = \'standard\'" + value: 0.1 + default: 0.0''', 'Conditional discount'), + ], + 'tags': ['generation', 'conditional', 'logic', 'derived'], + }, + { + 'id': 'generation.field.correlated', + 'name': 'Correlated Field Generation', + 'description': 'Generate values that are correlated (or negatively correlated) with another field. 
Useful for creating realistic relationships between numeric fields.', + 'config': [ + make_config_option('correlatedWith', 'string', 'Field name to correlate with', scope='field', yaml_path='fields[].options.correlatedWith'), + make_config_option('negativelyCorrelatedWith', 'string', 'Field name to negatively correlate with', scope='field', yaml_path='fields[].options.negativelyCorrelatedWith'), + ], + 'examples': [ + make_example('yaml', '''- name: revenue + type: double + options: + correlatedWith: "customer_count"''', 'Positively correlated fields'), + ], + 'tags': ['generation', 'correlation', 'statistical', 'relationship'], + }, + { + 'id': 'generation.field.mapping', + 'name': 'Value Mapping', + 'description': 'Map values from one field to generate deterministic output in another field.', + 'config': [ + make_config_option('mapping', 'object', 'Mapping configuration from source field to output values', required=True, scope='field', yaml_path='fields[].options.mapping'), + ], + 'examples': [ + make_example('yaml', '''- name: country_code + options: + mapping: + sourceField: "country" + mappings: + "United States": "US" + "United Kingdom": "UK"''', 'Country code mapping'), + ], + 'tags': ['generation', 'mapping', 'lookup', 'derived'], + }, + ] + + for gen in generators: + features.append(make_feature( + id=gen['id'], + name=gen['name'], + category='generation', + subcategory='generators', + status='stable', + description=gen['description'], + configuration=gen.get('config', []), + examples=gen.get('examples', []), + tags=gen.get('tags', []), + related_features=gen.get('related', []), + source_files=gen.get('source_files', []), + dependencies=gen.get('deps'), + performance_notes=gen.get('performance', []), + )) + + return features + + +def extract_field_option_features() -> list[dict]: + """Extract field-level configuration options as features.""" + features = [] + + # Data types + data_types = [ + ('string', 'String', 'Text data type. 
Default field type.'), + ('integer', 'Integer', '32-bit integer values.'), + ('long', 'Long', '64-bit long integer values.'), + ('double', 'Double', 'Double-precision floating point values.'), + ('float', 'Float', 'Single-precision floating point values.'), + ('decimal', 'Decimal', 'Fixed-precision decimal values with configurable precision and scale.'), + ('boolean', 'Boolean', 'True/false boolean values.'), + ('date', 'Date', 'Date values (year-month-day).'), + ('timestamp', 'Timestamp', 'Timestamp values with date and time.'), + ('binary', 'Binary', 'Binary byte array values.'), + ('array', 'Array', 'Array/list of elements. Configurable element type, min/max length.'), + ('struct', 'Struct', 'Nested structure with named fields. Supports deep nesting.'), + ('map', 'Map', 'Key-value map type with configurable key and value types.'), + ] + + for type_id, name, desc in data_types: + features.append(make_feature( + id=make_feature_id('generation', 'type', type_id), + name=f'{name} Type', + category='generation', + subcategory='data_types', + status='stable', + description=desc, + configuration=[ + make_config_option('type', 'string', f'Set field type to "{type_id}"', scope='field', yaml_path='fields[].type'), + ], + tags=['generation', 'type', type_id], + )) + + # Numeric field options + features.append(make_feature( + id='generation.option.numeric_range', + name='Numeric Range', + category='generation', + subcategory='field_options', + status='stable', + description='Constrain numeric fields (integer, long, double, float, decimal) to a minimum and maximum range.', + configuration=[ + make_config_option('min', 'any', 'Minimum value (inclusive)', scope='field', yaml_path='fields[].options.min', scala_constant='MINIMUM'), + make_config_option('max', 'any', 'Maximum value (inclusive)', scope='field', yaml_path='fields[].options.max', scala_constant='MAXIMUM'), + ], + examples=[ + make_example('yaml', '''- name: age + type: integer + options: + min: 18 + max: 120''', 
'Integer range'), + make_example('yaml', '''- name: price + type: double + options: + min: 9.99 + max: 999.99''', 'Double range'), + ], + tags=['generation', 'numeric', 'range', 'constraint'], + )) + + # Date/time field options + features.append(make_feature( + id='generation.option.date_range', + name='Date/Time Range', + category='generation', + subcategory='field_options', + status='stable', + description='Constrain date and timestamp fields to a minimum and maximum range. Also supports excluding weekends, business hours, within/future days.', + configuration=[ + make_config_option('min', 'string', 'Minimum date/timestamp', scope='field', yaml_path='fields[].options.min'), + make_config_option('max', 'string', 'Maximum date/timestamp', scope='field', yaml_path='fields[].options.max'), + make_config_option('excludeWeekends', 'boolean', 'Exclude Saturday and Sunday', default=False, scope='field', yaml_path='fields[].options.excludeWeekends', scala_constant='DATE_EXCLUDE_WEEKENDS'), + make_config_option('withinDays', 'integer', 'Generate dates within last N days from now', scope='field', yaml_path='fields[].options.withinDays'), + make_config_option('futureDays', 'integer', 'Generate dates within next N days from now', scope='field', yaml_path='fields[].options.futureDays'), + make_config_option('businessHours', 'boolean', 'Restrict to business hours', default=False, scope='field', yaml_path='fields[].options.businessHours'), + make_config_option('timeBetween', 'object', 'Generate times between start and end', scope='field', yaml_path='fields[].options.timeBetween'), + ], + examples=[ + make_example('yaml', '''- name: created_at + type: timestamp + options: + min: "2024-01-01T00:00:00" + max: "2024-12-31T23:59:59"''', 'Timestamp range'), + ], + tags=['generation', 'date', 'timestamp', 'range'], + )) + + # Null handling + features.append(make_feature( + id='generation.option.null_handling', + name='Null Value Control', + category='generation', + 
subcategory='field_options', + status='stable', + description='Control whether and how often null values appear in generated data. Configurable null probability per field.', + configuration=[ + make_config_option('enableNull', 'boolean', 'Allow null values for this field', default=False, scope='field', yaml_path='fields[].options.enableNull', scala_constant='ENABLED_NULL'), + make_config_option('nullProb', 'double', 'Probability of generating a null value (0-1)', scope='field', yaml_path='fields[].options.nullProb', scala_constant='PROBABILITY_OF_NULL'), + make_config_option('nullable', 'boolean', 'Whether the field schema allows nulls', default=True, scope='field', yaml_path='fields[].nullable'), + ], + examples=[ + make_example('yaml', '''- name: middle_name + options: + enableNull: true + nullProb: 0.3''', '30% null probability'), + ], + tags=['generation', 'null', 'nullable', 'probability'], + )) + + # Edge case handling + features.append(make_feature( + id='generation.option.edge_cases', + name='Edge Case Generation', + category='generation', + subcategory='field_options', + status='stable', + description='Control the probability of generating edge case values (empty strings, boundary values, special characters).', + configuration=[ + make_config_option('enableEdgeCase', 'boolean', 'Enable edge case generation', default=False, scope='field', scala_constant='ENABLED_EDGE_CASE'), + make_config_option('edgeCaseProb', 'double', 'Probability of generating edge case values (0-1)', scope='field', scala_constant='PROBABILITY_OF_EDGE_CASE'), + ], + tags=['generation', 'edge-case', 'boundary', 'testing'], + )) + + # String length + features.append(make_feature( + id='generation.option.string_length', + name='String Length Control', + category='generation', + subcategory='field_options', + status='stable', + description='Control the length of generated string values with minimum, maximum, and average length.', + configuration=[ + make_config_option('minLen', 'integer', 
'Minimum string length', scope='field', scala_constant='MINIMUM_LENGTH'), + make_config_option('maxLen', 'integer', 'Maximum string length', scope='field', scala_constant='MAXIMUM_LENGTH'), + make_config_option('avgLen', 'integer', 'Average string length', scope='field', scala_constant='AVERAGE_LENGTH'), + ], + tags=['generation', 'string', 'length', 'constraint'], + )) + + # Array options + features.append(make_feature( + id='generation.option.array_config', + name='Array Configuration', + category='generation', + subcategory='field_options', + status='stable', + description='Configure array field generation: element count, element type, uniqueness, empty probability, and weighted selection.', + configuration=[ + make_config_option('arrayMinLen', 'integer', 'Minimum array length', scope='field', yaml_path='fields[].options.arrayMinLength', scala_constant='ARRAY_MINIMUM_LENGTH'), + make_config_option('arrayMaxLen', 'integer', 'Maximum array length', scope='field', yaml_path='fields[].options.arrayMaxLength', scala_constant='ARRAY_MAXIMUM_LENGTH'), + make_config_option('arrayFixedSize', 'integer', 'Fixed array size', scope='field', scala_constant='ARRAY_FIXED_SIZE'), + make_config_option('arrayEmptyProb', 'double', 'Probability of empty array (0-1)', scope='field', yaml_path='fields[].options.arrayEmptyProbability', scala_constant='ARRAY_EMPTY_PROBABILITY'), + make_config_option('arrayType', 'string', 'Element data type for array', scope='field', scala_constant='ARRAY_TYPE'), + make_config_option('arrayOneOf', 'string', 'Comma-separated values for array elements', scope='field', scala_constant='ARRAY_ONE_OF'), + make_config_option('arrayUniqueFrom', 'string', 'Source for unique array elements', scope='field', scala_constant='ARRAY_UNIQUE_FROM'), + make_config_option('arrayWeightedOneOf', 'string', 'Weighted selection for elements (e.g., HIGH:0.2,MEDIUM:0.5,LOW:0.3)', scope='field', yaml_path='fields[].options.arrayWeightedOneOf', 
scala_constant='ARRAY_WEIGHTED_ONE_OF'), + ], + tags=['generation', 'array', 'collection', 'nested'], + )) + + # Map options + features.append(make_feature( + id='generation.option.map_config', + name='Map Configuration', + category='generation', + subcategory='field_options', + status='stable', + description='Configure map field generation with minimum and maximum size.', + configuration=[ + make_config_option('mapMinSize', 'integer', 'Minimum number of entries', scope='field', scala_constant='MAP_MINIMUM_SIZE'), + make_config_option('mapMaxSize', 'integer', 'Maximum number of entries', scope='field', scala_constant='MAP_MAXIMUM_SIZE'), + ], + tags=['generation', 'map', 'key-value', 'nested'], + )) + + # Distribution + features.append(make_feature( + id='generation.option.distribution', + name='Value Distribution', + category='generation', + subcategory='field_options', + status='stable', + description='Control the statistical distribution of generated numeric values. Supports uniform, normal, and exponential distributions.', + configuration=[ + make_config_option('distribution', 'enum', 'Distribution type', scope='field', valid_values=['uniform', 'normal', 'exponential'], scala_constant='DISTRIBUTION'), + make_config_option('mean', 'double', 'Mean for normal distribution', scope='field', scala_constant='MEAN'), + make_config_option('stddev', 'double', 'Standard deviation for normal distribution', scope='field', scala_constant='STANDARD_DEVIATION'), + make_config_option('distributionRateParam', 'double', 'Rate parameter for exponential distribution', scope='field', scala_constant='DISTRIBUTION_RATE_PARAMETER'), + ], + tags=['generation', 'distribution', 'statistical', 'normal', 'uniform'], + )) + + # Uniqueness + features.append(make_feature( + id='generation.option.uniqueness', + name='Uniqueness Constraint', + category='generation', + subcategory='field_options', + status='stable', + description='Enforce unique values for a field using bloom filter-based 
deduplication.', + configuration=[ + make_config_option('isUnique', 'boolean', 'Enforce unique values', default=False, scope='field', yaml_path='fields[].options.isUnique', scala_constant='IS_UNIQUE'), + make_config_option('isPrimaryKey', 'boolean', 'Mark as primary key (implies unique)', default=False, scope='field', yaml_path='fields[].options.isPrimaryKey', scala_constant='IS_PRIMARY_KEY'), + make_config_option('primaryKeyPos', 'integer', 'Position in composite primary key', scope='field', scala_constant='PRIMARY_KEY_POSITION'), + ], + tags=['generation', 'unique', 'primary-key', 'constraint'], + related_features=['configuration.flags.enable_unique_check'], + )) + + # Numeric precision + features.append(make_feature( + id='generation.option.numeric_precision', + name='Numeric Precision and Scale', + category='generation', + subcategory='field_options', + status='stable', + description='Control precision and scale for decimal fields, and rounding for numeric fields.', + configuration=[ + make_config_option('precision', 'integer', 'Numeric precision (total digits)', scope='field', scala_constant='NUMERIC_PRECISION'), + make_config_option('scale', 'integer', 'Numeric scale (decimal places)', scope='field', scala_constant='NUMERIC_SCALE'), + make_config_option('round', 'integer', 'Round numeric values to N decimal places', scope='field', scala_constant='ROUND'), + ], + tags=['generation', 'numeric', 'precision', 'decimal'], + )) + + # Omit field + features.append(make_feature( + id='generation.option.omit', + name='Field Omission', + category='generation', + subcategory='field_options', + status='stable', + description='Generate a field for use in computed expressions but omit it from the final output.', + configuration=[ + make_config_option('omit', 'boolean', 'Omit field from output', default=False, scope='field', yaml_path='fields[].options.omit', scala_constant='OMIT'), + ], + tags=['generation', 'omit', 'helper', 'computed'], + )) + + # Seed + 
features.append(make_feature( + id='generation.option.seed', + name='Random Seed', + category='generation', + subcategory='field_options', + status='stable', + description='Set a random seed for reproducible data generation per field.', + configuration=[ + make_config_option('seed', 'integer', 'Random seed for reproducible generation', scope='field', yaml_path='fields[].options.seed', scala_constant='RANDOM_SEED'), + ], + tags=['generation', 'seed', 'reproducible', 'deterministic'], + )) + + # Distinct count / histogram + features.append(make_feature( + id='generation.option.distinct_count', + name='Distinct Value Count', + category='generation', + subcategory='field_options', + status='stable', + description='Control how many distinct values are generated for a field. Used with metadata-driven generation.', + configuration=[ + make_config_option('distinctCount', 'integer', 'Number of distinct values to generate', scope='field', scala_constant='DISTINCT_COUNT'), + make_config_option('histogram', 'object', 'Value distribution histogram', scope='field', scala_constant='HISTOGRAM'), + ], + tags=['generation', 'distinct', 'cardinality', 'metadata'], + )) + + # Cassandra-specific + features.append(make_feature( + id='generation.option.cassandra_keys', + name='Cassandra Key Configuration', + category='generation', + subcategory='field_options', + status='stable', + description='Configure Cassandra-specific primary key and clustering positions for fields.', + configuration=[ + make_config_option('isPrimaryKey', 'boolean', 'Mark as partition key', scope='field', scala_constant='IS_PRIMARY_KEY'), + make_config_option('primaryKeyPos', 'integer', 'Position in composite partition key', scope='field', scala_constant='PRIMARY_KEY_POSITION'), + make_config_option('clusteringPos', 'integer', 'Clustering column position', scope='field', scala_constant='CLUSTERING_POSITION'), + ], + tags=['generation', 'cassandra', 'primary-key', 'clustering'], + )) + + # Incremental + 
features.append(make_feature( + id='generation.option.incremental', + name='Incremental Generation', + category='generation', + subcategory='field_options', + status='stable', + description='Mark a field for incremental generation, tracking the last generated value across runs.', + configuration=[ + make_config_option('incremental', 'boolean', 'Enable incremental mode', default=False, scope='field', scala_constant='INCREMENTAL'), + ], + tags=['generation', 'incremental', 'tracking'], + )) + + # HTTP parameter type + features.append(make_feature( + id='generation.option.http_param_type', + name='HTTP Parameter Type', + category='generation', + subcategory='field_options', + status='stable', + description='Specify the HTTP parameter type for a field when using the HTTP connector (path, query, or header).', + configuration=[ + make_config_option('httpParamType', 'enum', 'HTTP parameter placement', scope='field', valid_values=['path', 'query', 'header'], scala_constant='HTTP_PARAMETER_TYPE'), + ], + tags=['generation', 'http', 'parameter', 'api'], + )) + + # Post SQL expression + features.append(make_feature( + id='generation.option.post_sql_expression', + name='Post-SQL Expression', + category='generation', + subcategory='field_options', + status='stable', + description='Apply a SQL expression to transform the field value after initial generation.', + configuration=[ + make_config_option('postSqlExpression', 'string', 'SQL expression to apply after generation', scope='field', scala_constant='POST_SQL_EXPRESSION'), + ], + tags=['generation', 'sql', 'transform', 'post-processing'], + )) + + # Semantic version + features.append(make_feature( + id='generation.field.semantic_version', + name='Semantic Version Generation', + category='generation', + subcategory='generators', + status='stable', + description='Generate semantic version strings (e.g., 1.2.3).', + configuration=[ + make_config_option('semanticVersion', 'object', 'Semantic version configuration', scope='field', 
yaml_path='fields[].options.semanticVersion'), + ], + tags=['generation', 'version', 'semver'], + )) + + # Daily batch sequence + features.append(make_feature( + id='generation.field.daily_batch_sequence', + name='Daily Batch Sequence', + category='generation', + subcategory='generators', + status='stable', + description='Generate daily batch sequence identifiers.', + configuration=[ + make_config_option('dailyBatchSequence', 'object', 'Daily batch sequence configuration', scope='field', yaml_path='fields[].options.dailyBatchSequence'), + ], + tags=['generation', 'batch', 'daily', 'sequence'], + )) + + return features
+
+
+def extract_field_label_features() -> list[dict]:
+    """Build one catalog entry per supported field label.
+
+    The labels are declared inline as (label id, display name, description
+    stem) triples; nothing is parsed at call time.
+
+    Returns:
+        The values returned by ``make_feature``, one per label, in
+        declaration order.
+    """
+    # (label id, display name, description stem)
+    label_specs = [
+        ('name', 'Name', 'Generate person name fields (first name, last name, full name)'),
+        ('username', 'Username', 'Generate username fields'),
+        ('address', 'Address', 'Generate address fields (street, city, postcode)'),
+        ('app', 'Application', 'Generate application-related fields (version)'),
+        ('nation', 'Nation', 'Generate nationality, language, capital city'),
+        ('money', 'Money', 'Generate currency and financial fields'),
+        ('internet', 'Internet', 'Generate email, IP, MAC address fields'),
+        ('food', 'Food', 'Generate food and ingredient fields'),
+        ('job', 'Job', 'Generate job title, field, position'),
+        ('relationship', 'Relationship', 'Generate relationship type fields'),
+        ('weather', 'Weather', 'Generate weather description fields'),
+        ('phone', 'Phone', 'Generate phone number fields'),
+        ('geo', 'Geo', 'Generate geographic coordinate fields'),
+    ]
+
+    return [
+        make_feature(
+            id=make_feature_id('generation', 'label', label_id),
+            name=f'{display_name} Label',
+            category='generation',
+            subcategory='labels',
+            status='stable',
+            description=f'{stem}. Used for metadata-driven field generation to automatically select appropriate data generators.',
+            configuration=[
+                make_config_option('label', 'string', f'Set field label to "{label_id}" for auto-detection', scope='field', scala_constant='FIELD_LABEL'),
+            ],
+            tags=['generation', 'label', 'metadata', label_id],
+        )
+        for label_id, display_name, stem in label_specs
+    ]
+
+
+def extract_validation_features() -> list[dict]:
+    """Build the catalog entries for validation features.
+
+    Every entry is declared inline in this function (no file is read). The
+    result covers, in order: per-field validations, statistical validations,
+    SQL expression, group-by, upstream (cross-source), field-name (schema)
+    validations and wait conditions.
+
+    Returns:
+        The values returned by ``make_feature``, in the order listed above.
+    """
+
+    def to_camel(snake: str) -> str:
+        """Convert ``snake_case`` to ``camelCase`` (``starts_with`` -> ``startsWith``)."""
+        head, *rest = snake.split('_')
+        return head + ''.join(part.capitalize() for part in rest)
+
+    features = []
+
+    # Field-level validations: (id suffix, display name, description)
+    field_validations = [
+        ('null', 'Null Check', 'Validate that a field is null (or not null with negate=true).'),
+        ('unique', 'Unique Values', 'Validate that all values in a field are unique.'),
+        ('equal', 'Equality Check', 'Validate that field values equal a specified value.'),
+        ('contains', 'Contains Check', 'Validate that string field values contain a substring.'),
+        ('starts_with', 'Starts With', 'Validate that string field values start with a prefix.'),
+        ('ends_with', 'Ends With', 'Validate that string field values end with a suffix.'),
+        ('less_than', 'Less Than', 'Validate that numeric values are less than a threshold.'),
+        ('greater_than', 'Greater Than', 'Validate that numeric values are greater than a threshold.'),
+        ('between', 'Between Range', 'Validate that values fall within a min/max range (inclusive).'),
+        ('in', 'In Set', 'Validate that values exist in a specified set of allowed values.'),
+        ('matches', 'Regex Match', 'Validate that string values match a regular expression pattern.'),
+        ('matches_list', 'Regex Match List', 'Validate that string values match one of multiple regex patterns.'),
+        ('size', 'Size Check', 'Validate the size/length of a collection or string field.'),
+        ('less_than_size', 'Less Than Size', 'Validate that collection size is less than a threshold.'),
+        ('greater_than_size', 'Greater Than Size', 'Validate that collection size is greater than a threshold.'),
+        ('length_between', 'Length Between', 'Validate that string length falls within a range.'),
+        ('length_equal', 'Length Equal', 'Validate that string length equals a specific value.'),
+        ('luhn_check', 'Luhn Check', 'Validate values using the Luhn algorithm (credit cards, IDs).'),
+        ('has_type', 'Type Check', 'Validate that field values are of a specific data type.'),
+        ('has_types', 'Multi-Type Check', 'Validate that field values match one of multiple types.'),
+        ('is_decreasing', 'Monotonically Decreasing', 'Validate that values are in decreasing order.'),
+        ('is_increasing', 'Monotonically Increasing', 'Validate that values are in increasing order.'),
+        ('is_json_parsable', 'JSON Parsable', 'Validate that string values are valid JSON.'),
+        ('match_json_schema', 'JSON Schema Match', 'Validate that JSON values conform to a JSON schema.'),
+        ('match_date_time_format', 'DateTime Format Match', 'Validate that values match a specific date/time format.'),
+        ('distinct_in_set', 'Distinct In Set', 'Validate that all distinct values exist in a specified set.'),
+        ('distinct_contains_set', 'Distinct Contains Set', 'Validate that distinct values contain all values from a specified set.'),
+        ('distinct_equal', 'Distinct Equal', 'Validate that the set of distinct values exactly equals a specified set.'),
+        ('most_common_value_in_set', 'Most Common Value', 'Validate that the most common value is in a specified set.'),
+    ]
+
+    for val_id, name, desc in field_validations:
+        features.append(make_feature(
+            id=make_feature_id('validation', 'field', val_id),
+            name=name,
+            category='validation',
+            subcategory='field_validations',
+            status='stable',
+            description=desc,
+            configuration=[
+                # The hint shows the camelCase identifier; stripping underscores
+                # alone produced run-together lowercase such as "startswith".
+                # NOTE(review): assumes the runtime type names are camelCase,
+                # as the fieldNameType values further down are - confirm.
+                make_config_option('type', 'string', f'Set validation type to "{to_camel(val_id)}"', scope='field'),
+                make_config_option('negate', 'boolean', 'Invert the validation result', default=False, scope='field'),
+                make_config_option('errorThreshold', 'double', 'Allowed error rate (0-1)', scope='field'),
+                make_config_option('description', 'string', 'Human-readable description', scope='field'),
+            ],
+            tags=['validation', 'field', val_id.replace('_', '-')],
+        ))
+
+    # Statistical validations: (id suffix, display name, description)
+    stat_validations = [
+        ('max_between', 'Max Between', 'Validate that the maximum value of a field falls within a range.'),
+        ('mean_between', 'Mean Between', 'Validate that the mean value of a field falls within a range.'),
+        ('median_between', 'Median Between', 'Validate that the median value of a field falls within a range.'),
+        ('min_between', 'Min Between', 'Validate that the minimum value of a field falls within a range.'),
+        ('std_dev_between', 'Std Dev Between', 'Validate that the standard deviation of a field falls within a range.'),
+        ('sum_between', 'Sum Between', 'Validate that the sum of a field falls within a range.'),
+        ('unique_values_proportion_between', 'Unique Values Proportion', 'Validate that the proportion of unique values falls within a range.'),
+        ('quantile_values_between', 'Quantile Values Between', 'Validate that quantile values fall within specified ranges.'),
+    ]
+
+    for val_id, name, desc in stat_validations:
+        features.append(make_feature(
+            id=make_feature_id('validation', 'statistical', val_id),
+            name=name,
+            category='validation',
+            subcategory='statistical_validations',
+            status='stable',
+            description=desc,
+            configuration=[
+                # Previously an f-string with no placeholder, so the hint never
+                # said which type to set; now mirrors the field-level hint.
+                make_config_option('type', 'string', f'Set validation type to "{to_camel(val_id)}"', scope='field'),
+                make_config_option('min', 'any', 'Minimum expected value', scope='field'),
+                make_config_option('max', 'any', 'Maximum expected value', scope='field'),
+            ],
+            tags=['validation', 'statistical', val_id.replace('_', '-')],
+        ))
+
+    # Expression validation
+    features.append(make_feature(
+        id='validation.expression',
+        name='SQL Expression Validation',
+        category='validation',
+        subcategory='expression_validations',
+        status='stable',
+        description='Validate data using arbitrary Spark SQL expressions that must evaluate to true. The most flexible validation type.',
+        configuration=[
+            make_config_option('expr', 'string', 'SQL expression that must evaluate to true', required=True, scope='step', yaml_path='validations[].expr'),
+            make_config_option('selectExpr', 'array', 'SELECT columns for the expression', scope='step', yaml_path='validations[].selectExpr'),
+            make_config_option('preFilterExpr', 'string', 'SQL filter to apply before validation', scope='step', yaml_path='validations[].preFilterExpr'),
+            make_config_option('description', 'string', 'Human-readable description', scope='step'),
+            make_config_option('errorThreshold', 'double', 'Allowed error rate (0-1)', scope='step'),
+        ],
+        examples=[
+            make_example('yaml', '''validations:
+  - expr: "age >= 18 AND age <= 120"
+    description: "Age must be valid"''', 'Expression validation'),
+        ],
+        tags=['validation', 'expression', 'sql', 'flexible'],
+    ))
+
+    # Group-by validation
+    features.append(make_feature(
+        id='validation.group_by',
+        name='Group By Aggregation Validation',
+        category='validation',
+        subcategory='aggregation_validations',
+        status='stable',
+        description='Validate aggregated data grouped by specified fields. Supports sum, avg, min, max, count, and stddev aggregations.',
+        configuration=[
+            make_config_option('groupByFields', 'array', 'Fields to group by', required=True, scope='step', yaml_path='validations[].groupByFields'),
+            make_config_option('aggField', 'string', 'Field to aggregate', scope='step', yaml_path='validations[].aggField'),
+            make_config_option('aggType', 'enum', 'Aggregation function', scope='step', yaml_path='validations[].aggType', valid_values=['sum', 'avg', 'min', 'max', 'count', 'stddev']),
+            make_config_option('aggExpr', 'string', 'Custom aggregation expression', scope='step', yaml_path='validations[].aggExpr'),
+        ],
+        examples=[
+            make_example('yaml', '''validations:
+  - groupByFields: ["status"]
+    aggField: "balance"
+    aggType: "avg"
+    aggExpr: "avg_balance > 0"
+    description: "Average balance per status"''', 'Group by validation'),
+        ],
+        tags=['validation', 'aggregation', 'group-by', 'statistical'],
+    ))
+
+    # Upstream validation
+    features.append(make_feature(
+        id='validation.upstream',
+        name='Upstream Cross-Source Validation',
+        category='validation',
+        subcategory='cross_source_validations',
+        status='stable',
+        description='Validate data by joining with an upstream data source. Enables cross-system data consistency checks.',
+        configuration=[
+            make_config_option('upstreamDataSource', 'string', 'Upstream data source name', required=True, scope='step', yaml_path='validations[].upstreamDataSource'),
+            make_config_option('upstreamReadOptions', 'object', 'Read options for upstream source', scope='step', yaml_path='validations[].upstreamReadOptions'),
+            make_config_option('joinFields', 'array', 'Fields to join on', scope='step', yaml_path='validations[].joinFields'),
+            # 'outer' is the declared default and the value used in the example
+            # below, so it has to be listed as a valid value as well.
+            make_config_option('joinType', 'enum', 'Join type', default='outer', scope='step', yaml_path='validations[].joinType', valid_values=['outer', 'inner', 'left', 'right', 'full', 'anti', 'semi']),
+        ],
+        examples=[
+            make_example('yaml', '''validations:
+  - upstreamDataSource: "source_json"
+    joinFields: ["account_id"]
+    joinType: "outer"
+    validations:
+      - expr: "source_json_name == name"''', 'Cross-source validation'),
+        ],
+        tags=['validation', 'upstream', 'cross-source', 'join'],
+    ))
+
+    # Field names validation
+    features.append(make_feature(
+        id='validation.field_names',
+        name='Schema Field Names Validation',
+        category='validation',
+        subcategory='schema_validations',
+        status='stable',
+        description='Validate the schema structure by checking field/column names, counts, and ordering.',
+        configuration=[
+            make_config_option('names', 'array', 'Expected field names', scope='step', yaml_path='validations[].names'),
+            make_config_option('fieldNameType', 'enum', 'Validation type for field names', scope='step', valid_values=['fieldCountEqual', 'fieldCountBetween', 'fieldNameMatchOrder', 'fieldNameMatchSet']),
+            make_config_option('count', 'integer', 'Expected exact field count', scope='step'),
+            make_config_option('min', 'integer', 'Minimum field count', scope='step'),
+            make_config_option('max', 'integer', 'Maximum field count', scope='step'),
+        ],
+        tags=['validation', 'schema', 'field-names', 'structure'],
+    ))
+
+    # Wait conditions
+    features.append(make_feature(
+        id='validation.wait_condition',
+        name='Wait Conditions',
+        category='validation',
+        subcategory='wait_conditions',
+        status='stable',
+        description='Define conditions to wait for before running validations. Supports pause, file existence, data existence, and webhook checks.',
+        configuration=[
+            make_config_option('type', 'enum', 'Wait condition type', scope='step', yaml_path='validations[].waitCondition.type', valid_values=['pause', 'fileExists', 'dataExists', 'webhook']),
+            make_config_option('pauseInSeconds', 'integer', 'Seconds to pause', scope='step'),
+            make_config_option('path', 'string', 'File path to wait for', scope='step'),
+            make_config_option('url', 'string', 'Webhook URL', scope='step'),
+            make_config_option('method', 'enum', 'HTTP method for webhook', scope='step', valid_values=['GET', 'POST', 'PUT', 'DELETE']),
+            make_config_option('statusCodes', 'array', 'Expected HTTP status codes', scope='step'),
+            make_config_option('maxRetries', 'integer', 'Maximum retry attempts', scope='step'),
+            make_config_option('waitBeforeRetrySeconds', 'integer', 'Seconds between retries', scope='step'),
+        ],
+        tags=['validation', 'wait', 'condition', 'async'],
+    ))
+
+    return features
+
+
+def extract_configuration_features(constants_content: str, config_content: str) -> list[dict]:
+    """Build the catalog entries for configuration flags and settings.
+
+    Flag and folder entries are driven by the fields that
+    ``parse_scala_case_class`` finds in ``config_content`` for the
+    ``FlagsConfig`` and ``FoldersConfig`` case classes; the remaining entries
+    (generation, metadata, streaming, alerts, validation runtime, Spark runtime
+    and sink options) are declared inline.
+
+    Args:
+        constants_content: Not read by this function; kept so existing callers
+            keep working.
+        config_content: Text handed to ``parse_scala_case_class``.
+
+    Returns:
+        The values returned by ``make_feature``: flags first, then folders,
+        then the inline entries.
+    """
+    features = []
+
+    # flag name -> (display name, description, env-var constant name)
+    flag_descriptions = {
+        'enableCount': ('Count Records', 'Count the number of records generated for each data source step.', 'ENABLE_COUNT'),
+        'enableGenerateData': ('Generate Data', 'Enable or disable data generation. When false, only validation runs.', 'ENABLE_GENERATE_DATA'),
+        'enableRecordTracking': ('Record Tracking', 'Track generated records for later cleanup/deletion.', 'ENABLE_RECORD_TRACKING'),
+        'enableDeleteGeneratedRecords': ('Delete Generated Records', 'Enable cleanup mode to delete previously generated records.', 'ENABLE_DELETE_GENERATED_RECORDS'),
+        'enableGeneratePlanAndTasks': ('Auto-Generate Plan and Tasks', 'Automatically generate plan and tasks from metadata sources.', 'ENABLE_GENERATE_PLAN_AND_TASKS'),
+        'enableFailOnError': ('Fail on Error', 'Fail execution immediately when errors occur.', 'ENABLE_FAIL_ON_ERROR'),
+        'enableUniqueCheck': ('Unique Check', 'Validate uniqueness constraints during data generation.', 'ENABLE_UNIQUE_CHECK'),
+        'enableSinkMetadata': ('Sink Metadata', 'Save metadata about generated data to the sink.', 'ENABLE_SINK_METADATA'),
+        'enableSaveReports': ('Save Reports', 'Generate and save execution reports with generation and validation results.', 'ENABLE_SAVE_REPORTS'),
+        'enableValidation': ('Data Validation', 'Run data validations after generation completes.', 'ENABLE_VALIDATION'),
+        'enableGenerateValidations': ('Suggest Validations', 'Auto-suggest validations based on data analysis.', 'ENABLE_SUGGEST_VALIDATIONS'),
+        'enableAlerts': ('Alerts', 'Send alert notifications on completion (supports Slack).', 'ENABLE_ALERTS'),
+        'enableUniqueCheckOnlyInBatch': ('Unique Check Only In Batch', 'Check uniqueness only within the current batch for better performance.', 'ENABLE_UNIQUE_CHECK_ONLY_WITHIN_BATCH'),
+        'enableFastGeneration': ('Fast Generation', 'Use SQL-based generation for regex patterns instead of UDFs. Dramatically improves performance.', 'ENABLE_FAST_GENERATION'),
+    }
+
+    # Symbolic default name -> literal value. Built once here; it does not
+    # depend on the flag being processed.
+    default_map = {
+        'DEFAULT_ENABLE_COUNT': True, 'DEFAULT_ENABLE_GENERATE_DATA': True,
+        'DEFAULT_ENABLE_RECORD_TRACKING': False, 'DEFAULT_ENABLE_DELETE_GENERATED_RECORDS': False,
+        'DEFAULT_ENABLE_GENERATE_PLAN_AND_TASKS': False, 'DEFAULT_ENABLE_FAIL_ON_ERROR': True,
+        'DEFAULT_ENABLE_UNIQUE_CHECK': False, 'DEFAULT_ENABLE_SINK_METADATA': False,
+        'DEFAULT_ENABLE_SAVE_REPORTS': True, 'DEFAULT_ENABLE_VALIDATION': True,
+        'DEFAULT_ENABLE_SUGGEST_VALIDATIONS': False, 'DEFAULT_ENABLE_ALERTS': True,
+        'DEFAULT_ENABLE_UNIQUE_CHECK_ONLY_WITHIN_BATCH': False, 'DEFAULT_ENABLE_FAST_GENERATION': False,
+    }
+
+    # Parse FlagsConfig case class
+    for flag in parse_scala_case_class(config_content, 'FlagsConfig'):
+        name = flag['name']
+        if name not in flag_descriptions:
+            # Flags without a curated description are left out of the catalog.
+            continue
+        display_name, desc, env_const = flag_descriptions[name]
+        default_val = flag.get('default', '')
+        # Resolve default references; anything not in the map passes through as parsed.
+        resolved_default = default_map.get(str(default_val), default_val)
+
+        features.append(make_feature(
+            id=make_feature_id('configuration', 'flags', name),
+            name=display_name,
+            category='configuration',
+            subcategory='flags',
+            status='stable',
+            description=desc,
+            configuration=[
+                make_config_option(name, 'boolean', desc, default=resolved_default, scope='global',
+                                   yaml_path=f'config.flags.{name}', env_var=env_const),
+            ],
+            tags=['configuration', 'flag', name.replace('enable', '').lower()],
+            source_files=[
+                {'path': 'api/src/main/scala/io/github/datacatering/datacaterer/api/model/ConfigModels.scala', 'role': 'primary'},
+            ],
+        ))
+
+    # Folder configuration
+    for folder in parse_scala_case_class(config_content, 'FoldersConfig'):
+        folder_name = folder['name']
+        features.append(make_feature(
+            id=make_feature_id('configuration', 'folders', folder_name),
+            name=folder_name.replace('FolderPath', ' Folder').replace('FilePath', ' File').replace('Path', ''),
+            category='configuration',
+            subcategory='folders',
+            status='stable',
+            description=f'Configuration path for {folder_name}.',
+            configuration=[
+                make_config_option(folder_name, 'string', f'Path setting for {folder_name}', scope='global',
+                                   yaml_path=f'config.folders.{folder_name}'),
+            ],
+            tags=['configuration', 'folder', 'path'],
+        ))
+
+    # Generation config
+    features.append(make_feature(
+        id='configuration.generation.batch_size',
+        name='Batch Size',
+        category='configuration',
+        subcategory='generation',
+        status='stable',
+        description='Control the number of records generated per batch. Affects memory usage and performance.',
+        configuration=[
+            make_config_option('numRecordsPerBatch', 'long', 'Records per batch', default=100000, scope='global',
+                               yaml_path='config.generation.numRecordsPerBatch'),
+            make_config_option('numRecordsPerStep', 'long', 'Default records per step/table', scope='global',
+                               yaml_path='config.generation.numRecordsPerStep'),
+        ],
+        tags=['configuration', 'generation', 'batch', 'performance'],
+    ))
+
+    features.append(make_feature(
+        id='configuration.generation.bloom_filter',
+        name='Bloom Filter Configuration',
+        category='configuration',
+        subcategory='generation',
+        status='stable',
+        description='Configure bloom filter parameters for uniqueness checking during generation.',
+        configuration=[
+            make_config_option('uniqueBloomFilterNumItems', 'long', 'Expected number of items in bloom filter', default=10000000, scope='global',
+                               yaml_path='config.generation.uniqueBloomFilterNumItems'),
+            make_config_option('uniqueBloomFilterFalsePositiveProbability', 'double', 'Bloom filter false positive rate (0-1)', default=0.01, scope='global',
+                               yaml_path='config.generation.uniqueBloomFilterFalsePositiveProbability',
+                               range={'min': 0, 'max': 1}),
+        ],
+        tags=['configuration', 'generation', 'bloom-filter', 'uniqueness'],
+    ))
+
+    # Metadata config
+    features.append(make_feature(
+        id='configuration.metadata',
+        name='Metadata Analysis Configuration',
+        category='configuration',
+        subcategory='metadata',
+        status='stable',
+        description='Configure how metadata is sampled and analyzed for auto-generation of field patterns.',
+        configuration=[
+            make_config_option('numRecordsFromDataSource', 'integer', 'Sample size from data source', default=10000, scope='global',
+                               yaml_path='config.metadata.numRecordsFromDataSource'),
+            make_config_option('numRecordsForAnalysis', 'integer', 'Records analyzed for pattern detection', default=10000, scope='global',
+                               yaml_path='config.metadata.numRecordsForAnalysis'),
+            make_config_option('oneOfDistinctCountVsCountThreshold', 'double', 'Threshold for detecting oneOf fields', default=0.2, scope='global',
+                               yaml_path='config.metadata.oneOfDistinctCountVsCountThreshold'),
+            make_config_option('oneOfMinCount', 'long', 'Minimum records for oneOf detection', default=1000, scope='global',
+                               yaml_path='config.metadata.oneOfMinCount'),
+            make_config_option('numGeneratedSamples', 'integer', 'Number of sample records in metadata suggestions', default=10, scope='global',
+                               yaml_path='config.metadata.numGeneratedSamples'),
+        ],
+        tags=['configuration', 'metadata', 'analysis', 'sampling'],
+    ))
+
+    # Streaming config
+    features.append(make_feature(
+        id='configuration.streaming',
+        name='Streaming Configuration',
+        category='configuration',
+        subcategory='streaming',
+        status='stable',
+        description='Configure streaming/real-time data generation parameters.',
+        configuration=[
+            make_config_option('maxTimeoutSeconds', 'integer', 'Maximum streaming timeout', default=3600, scope='global'),
+            make_config_option('maxAsyncParallelism', 'integer', 'Maximum async parallelism', default=100, scope='global'),
+            make_config_option('responseBufferSize', 'integer', 'Response buffer size for streaming', default=10000, scope='global'),
+            make_config_option('timestampWindowMs', 'long', 'Timestamp window in milliseconds', default=1000, scope='global'),
+        ],
+        tags=['configuration', 'streaming', 'real-time', 'performance'],
+    ))
+
+    # Alert config
+    features.append(make_feature(
+        id='configuration.alerts',
+        name='Alert Configuration',
+        category='configuration',
+        subcategory='alerts',
+        status='stable',
+        description='Configure alert notifications triggered on execution completion. Supports Slack integration.',
+        configuration=[
+            make_config_option('triggerOn', 'enum', 'When to trigger alerts', default='all', scope='global',
+                               yaml_path='config.alert.triggerOn',
+                               valid_values=['all', 'failure', 'success', 'generation_failure', 'validation_failure', 'generation_success', 'validation_success']),
+            make_config_option('slackToken', 'string', 'Slack API token', scope='global', yaml_path='config.alert.slackToken'),
+            make_config_option('slackChannels', 'array', 'Slack channels to notify', scope='global', yaml_path='config.alert.slackChannels'),
+        ],
+        tags=['configuration', 'alert', 'notification', 'slack'],
+    ))
+
+    # Validation runtime config
+    features.append(make_feature(
+        id='configuration.validation',
+        name='Validation Runtime Configuration',
+        category='configuration',
+        subcategory='validation_runtime',
+        status='stable',
+        description='Configure validation execution behavior.',
+        configuration=[
+            make_config_option('numSampleErrorRecords', 'integer', 'Number of sample error records in reports', default=5, scope='global',
+                               yaml_path='config.validation.numSampleErrorRecords'),
+            make_config_option('enableDeleteRecordTrackingFiles', 'boolean', 'Delete tracking files after validation', default=True, scope='global',
+                               yaml_path='config.validation.enableDeleteRecordTrackingFiles'),
+        ],
+        tags=['configuration', 'validation', 'runtime'],
+    ))
+
+    # Spark runtime config
+    features.append(make_feature(
+        id='configuration.runtime.spark',
+        name='Apache Spark Configuration',
+        category='configuration',
+        subcategory='runtime',
+        status='stable',
+        description='Configure the Apache Spark runtime for data processing. Set master URL, driver/executor memory, and Spark SQL settings.',
+        configuration=[
+            make_config_option('master', 'string', 'Spark master URL', default='local[*]', scope='global', yaml_path='config.runtime.master'),
+            make_config_option('sparkConfig', 'object', 'Spark configuration key-value pairs', scope='global', yaml_path='config.runtime.sparkConfig'),
+        ],
+        examples=[
+            make_example('yaml', '''config:
+  runtime:
+    master: "local[4]"
+    sparkConfig:
+      "spark.driver.memory": "4g"
+      "spark.sql.shuffle.partitions": "10"''', 'Spark configuration'),
+        ],
+        tags=['configuration', 'runtime', 'spark', 'performance'],
+    ))
+
+    # Sink options
+    features.append(make_feature(
+        id='configuration.sink_options',
+        name='Global Sink Options',
+        category='configuration',
+        subcategory='sink',
+        status='stable',
+        description='Global options for data output: random seed for reproducibility and locale for data generation.',
+        configuration=[
+            make_config_option('seed', 'string', 'Random seed for reproducible generation', scope='global', yaml_path='sinkOptions.seed'),
+            make_config_option('locale', 'string', 'Locale for data generation (affects names, addresses)', scope='global', yaml_path='sinkOptions.locale'),
+        ],
+        examples=[
+            make_example('yaml', '''sinkOptions:
+  seed: "42"
+  locale: "en-US"''', 'Sink options'),
+        ],
+        tags=['configuration', 'sink', 'seed', 'locale'],
+    ))
+
+    return features
+
+
+def extract_advanced_features() -> list[dict]: + """Extract advanced features: foreign keys, count, streaming, transformation, metadata sources.""" + features = [] + + # Foreign key relationships + features.append(make_feature( + id='advanced.foreign_keys', + name='Foreign Key Relationships', + category='advanced', + subcategory='referential_integrity', + status='stable', + description='Define foreign key relationships between data sources to maintain referential integrity. 
Supports composite keys, cardinality control, nullability, and multiple generation modes.', + configuration=[ + make_config_option('source', 'object', 'Source table containing primary key', required=True, scope='plan', yaml_path='foreignKeys[].source'), + make_config_option('generate', 'array', 'Target tables with foreign key references', scope='plan', yaml_path='foreignKeys[].generate'), + make_config_option('delete', 'array', 'Target tables for cleanup', scope='plan', yaml_path='foreignKeys[].delete'), + ], + examples=[ + make_example('yaml', '''foreignKeys: + - source: + dataSource: postgres_db + step: customers + fields: ["customer_id"] + generate: + - dataSource: postgres_db + step: orders + fields: ["customer_id"] + cardinality: + min: 1 + max: 10 + distribution: "uniform"''', 'Foreign key with cardinality'), + ], + tags=['advanced', 'foreign-key', 'referential-integrity', 'relationship'], + )) + + # Foreign key cardinality + features.append(make_feature( + id='advanced.foreign_key_cardinality', + name='Foreign Key Cardinality Control', + category='advanced', + subcategory='referential_integrity', + status='stable', + description='Control the cardinality of foreign key relationships. 
Set min/max records per parent, ratio multipliers, and distribution patterns.', + configuration=[ + make_config_option('min', 'integer', 'Minimum records per parent key', scope='plan', yaml_path='foreignKeys[].generate[].cardinality.min'), + make_config_option('max', 'integer', 'Maximum records per parent key', scope='plan', yaml_path='foreignKeys[].generate[].cardinality.max'), + make_config_option('ratio', 'double', 'Ratio multiplier (e.g., 10.0 = 10x parent records)', scope='plan', yaml_path='foreignKeys[].generate[].cardinality.ratio'), + make_config_option('distribution', 'enum', 'Cardinality distribution', default='uniform', scope='plan', + yaml_path='foreignKeys[].generate[].cardinality.distribution', + valid_values=['uniform', 'normal', 'zipf', 'power']), + ], + tags=['advanced', 'cardinality', 'distribution', 'foreign-key'], + )) + + # Foreign key nullability + features.append(make_feature( + id='advanced.foreign_key_nullability', + name='Foreign Key Nullability', + category='advanced', + subcategory='referential_integrity', + status='stable', + description='Control null value injection in foreign key fields. 
Configure percentage of nulls and distribution strategy (random, head, tail).', + configuration=[ + make_config_option('nullPercentage', 'double', 'Percentage of null values (0-1)', scope='plan', + yaml_path='foreignKeys[].generate[].nullability.nullPercentage', + range={'min': 0, 'max': 1}), + make_config_option('strategy', 'enum', 'Null distribution strategy', default='random', scope='plan', + yaml_path='foreignKeys[].generate[].nullability.strategy', + valid_values=['random', 'leading', 'trailing']), + ], + tags=['advanced', 'nullability', 'foreign-key', 'null'], + )) + + # Foreign key generation modes + features.append(make_feature( + id='advanced.foreign_key_generation_modes', + name='Foreign Key Generation Modes', + category='advanced', + subcategory='referential_integrity', + status='stable', + description='Control how foreign key values are generated. "all-exist" ensures all records have valid FKs, "all-combinations" generates all possible combinations, "partial" creates a mix of valid and invalid references.', + configuration=[ + make_config_option('generationMode', 'enum', 'FK generation strategy', default='all-exist', scope='plan', + valid_values=['all-exist', 'all-combinations', 'partial']), + ], + use_cases=[ + 'all-exist: Standard referential integrity testing', + 'all-combinations: Comprehensive join testing with all possible combinations', + 'partial: Testing handling of orphan records and broken references', + ], + tags=['advanced', 'foreign-key', 'generation-mode'], + )) + + # Record count configuration + features.append(make_feature( + id='advanced.count', + name='Record Count Configuration', + category='advanced', + subcategory='count', + status='stable', + description='Configure how many records to generate per step. 
Supports fixed count, per-field distribution, and streaming rate-based generation.', + configuration=[ + make_config_option('records', 'integer', 'Total records to generate', default=1000, scope='step', yaml_path='dataSources[].steps[].count.records'), + make_config_option('perField', 'object', 'Generate records per unique field value', scope='step', yaml_path='dataSources[].steps[].count.perField'), + ], + examples=[ + make_example('yaml', '''count: + records: 5000''', 'Fixed count'), + make_example('yaml', '''count: + records: 100 + perField: + fieldNames: ["account_id"] + options: + min: 1 + max: 5''', 'Per-field count distribution'), + ], + tags=['advanced', 'count', 'records', 'distribution'], + )) + + # Streaming / load patterns + features.append(make_feature( + id='advanced.streaming_load_patterns', + name='Streaming Load Patterns', + category='advanced', + subcategory='streaming', + status='stable', + description='Define time-based data generation patterns for streaming scenarios. 
Supports ramp, spike, sine, and custom step patterns.', + configuration=[ + make_config_option('duration', 'string', 'Streaming duration (e.g., 10m, 1h)', scope='step', yaml_path='dataSources[].steps[].count.duration'), + make_config_option('rate', 'integer', 'Records per time unit', scope='step', yaml_path='dataSources[].steps[].count.rate'), + make_config_option('rateUnit', 'enum', 'Time unit for rate', scope='step', yaml_path='dataSources[].steps[].count.rateUnit', valid_values=['second', 'minute', 'hour']), + make_config_option('pattern.type', 'enum', 'Load pattern type', scope='step', yaml_path='dataSources[].steps[].count.pattern.type', valid_values=['ramp', 'spike', 'sine', 'steps']), + make_config_option('pattern.startRate', 'integer', 'Starting rate for ramp pattern', scope='step'), + make_config_option('pattern.endRate', 'integer', 'Ending rate for ramp pattern', scope='step'), + make_config_option('pattern.baseRate', 'integer', 'Base rate for spike pattern', scope='step'), + make_config_option('pattern.spikeRate', 'integer', 'Spike rate', scope='step'), + make_config_option('pattern.amplitude', 'integer', 'Amplitude for sine pattern', scope='step'), + make_config_option('pattern.frequency', 'double', 'Frequency for sine pattern', scope='step'), + make_config_option('pattern.steps', 'array', 'Custom step definitions with rate and duration', scope='step'), + ], + examples=[ + make_example('yaml', '''count: + duration: "1m" + rate: 100 + rateUnit: "second" + pattern: + type: "ramp" + startRate: 10 + endRate: 200''', 'Ramp load pattern'), + ], + tags=['advanced', 'streaming', 'load-pattern', 'rate'], + )) + + # Transformation + features.append(make_feature( + id='advanced.transformation', + name='Post-Generation Transformation', + category='advanced', + subcategory='transformation', + status='stable', + description='Apply custom Java/Scala transformations to generated data before writing to output. 
Supports whole-file and row-by-row modes.', + configuration=[ + make_config_option('className', 'string', 'Fully qualified transformation class name', required=True, scope='step', yaml_path='dataSources[].steps[].transformation.className'), + make_config_option('methodName', 'string', 'Method to call', default='transform', scope='step'), + make_config_option('mode', 'enum', 'Transformation mode', scope='step', valid_values=['whole-file', 'row-by-row']), + make_config_option('outputPath', 'string', 'Output directory', scope='step'), + make_config_option('deleteOriginal', 'boolean', 'Delete input after transformation', scope='step'), + make_config_option('enabled', 'boolean', 'Enable/disable transformation', default=True, scope='step'), + ], + tags=['advanced', 'transformation', 'custom', 'plugin'], + )) + + # Step options + features.append(make_feature( + id='advanced.step_options', + name='Step Field Filtering', + category='advanced', + subcategory='step_options', + status='stable', + description='Include or exclude fields from metadata-driven generation using exact names or patterns.', + configuration=[ + make_config_option('includeFields', 'array', 'List of field names to include', scope='step', scala_constant='INCLUDE_FIELDS'), + make_config_option('excludeFields', 'array', 'List of field names to exclude', scope='step', scala_constant='EXCLUDE_FIELDS'), + make_config_option('includeFieldPatterns', 'array', 'Regex patterns for fields to include', scope='step', scala_constant='INCLUDE_FIELD_PATTERNS'), + make_config_option('excludeFieldPatterns', 'array', 'Regex patterns for fields to exclude', scope='step', scala_constant='EXCLUDE_FIELD_PATTERNS'), + make_config_option('allCombinations', 'boolean', 'Generate all field value combinations', scope='step', scala_constant='ALL_COMBINATIONS'), + ], + tags=['advanced', 'step', 'filtering', 'metadata'], + )) + + # Metadata sources + metadata_sources = [ + ('marquez', 'Marquez', 'Apache Marquez open-source metadata 
service with OpenLineage support.'), + ('open_metadata', 'OpenMetadata', 'OpenMetadata platform for metadata discovery. Supports multiple auth types (basic, Azure, Google, Okta, Auth0, AWS Cognito).'), + ('open_api', 'OpenAPI/Swagger', 'Generate data from OpenAPI/Swagger specifications.'), + ('great_expectations', 'Great Expectations', 'Import data quality expectations from Great Expectations suites.'), + ('open_data_contract_standard', 'Open Data Contract Standard', 'Import schemas from ODCS format.'), + ('data_contract_cli', 'Data Contract CLI', 'Import schemas from Data Contract CLI format.'), + ('amundsen', 'Amundsen', 'Import metadata from Amundsen data catalog.'), + ('datahub', 'DataHub', 'Import metadata from DataHub data catalog.'), + ('confluent_schema_registry', 'Confluent Schema Registry', 'Import schemas from Confluent Schema Registry (Avro, Protobuf, JSON Schema).'), + ('json_schema', 'JSON Schema', 'Generate data from JSON Schema definitions.'), + ] + + for src_id, name, desc in metadata_sources: + features.append(make_feature( + id=make_feature_id('metadata', 'source', src_id), + name=f'{name} Integration', + category='metadata', + subcategory='sources', + status='stable', + description=desc, + tags=['metadata', 'integration', src_id.replace('_', '-')], + )) + + # Reference mode + features.append(make_feature( + id='advanced.reference_mode', + name='Reference Mode', + category='advanced', + subcategory='reference', + status='stable', + description='Load existing data as reference for foreign key relationships instead of generating new data. 
Useful when you need realistic FK values from existing datasets.', + configuration=[ + make_config_option('enableReferenceMode', 'boolean', 'Enable reference mode for this data source', default=False, scope='datasource'), + make_config_option('enableDataGeneration', 'boolean', 'Disable generation (use with reference mode)', default=True, scope='datasource'), + ], + tags=['advanced', 'reference', 'existing-data', 'foreign-key'], + )) + + # Plan and run interfaces + features.append(make_feature( + id='advanced.interfaces', + name='Configuration Interfaces', + category='advanced', + subcategory='interfaces', + status='stable', + description='Data Caterer supports multiple configuration interfaces: Java API, Scala API, YAML configuration, and Web UI.', + use_cases=[ + 'Java API: Programmatic configuration from Java applications', + 'Scala API: Programmatic configuration with Scala builders', + 'YAML: Declarative configuration for CI/CD and automation', + 'Web UI: Visual configuration and execution management', + ], + tags=['advanced', 'interface', 'api', 'yaml', 'ui'], + )) + + # Environment variable substitution + features.append(make_feature( + id='advanced.env_substitution', + name='Environment Variable Substitution', + category='advanced', + subcategory='configuration', + status='stable', + description='Use ${VAR_NAME} syntax in YAML configuration to substitute environment variables at runtime. 
def extract_ui_features() -> list[dict]:
    """Build the catalog entries describing the web UI capabilities.

    Returns:
        One feature dict per UI capability, each created through
        ``make_feature`` with category ``ui_api`` and subcategory ``web_ui``.
    """
    # (feature id, display name, description) for every UI capability.
    web_ui_entries = (
        ('ui.connection_management', 'Connection Management', 'Create, edit, test, and manage data source connections through the web UI.'),
        ('ui.plan_creation', 'Interactive Plan Creation', 'Build data generation plans interactively with visual field configuration.'),
        ('ui.execution_history', 'Execution History', 'View past execution runs with status, timing, and record counts.'),
        ('ui.results_viewing', 'Real-time Results', 'View generation and validation results in real-time during execution.'),
        ('ui.sample_data', 'Sample Data Generation', 'Preview generated sample data before running full generation.'),
        ('ui.report_generation', 'Report Generation', 'Generate detailed HTML reports with generation statistics and validation results.'),
    )
    ui_source_dir = 'app/src/main/scala/io/github/datacatering/datacaterer/core/ui/'

    return [
        make_feature(
            id=feature_id,
            name=title,
            category='ui_api',
            subcategory='web_ui',
            status='stable',
            description=summary,
            # The last id segment doubles as a tag, e.g. "plan_creation" -> "plan-creation".
            tags=['ui', 'web', feature_id.rsplit('.', 1)[-1].replace('_', '-')],
            source_files=[{'path': ui_source_dir, 'role': 'primary'}],
        )
        for feature_id, title, summary in web_ui_entries
    ]
{len([f for f in all_features if f['category'] == 'connectors'])}") + + # Field generation + all_features.extend(extract_field_generation_features()) + all_features.extend(extract_field_option_features()) + all_features.extend(extract_field_label_features()) + gen_count = len([f for f in all_features if f['category'] == 'generation']) + print(f" Generation: {gen_count}") + + # Validation + all_features.extend(extract_validation_features()) + val_count = len([f for f in all_features if f['category'] == 'validation']) + print(f" Validation: {val_count}") + + # Configuration + all_features.extend(extract_configuration_features(constants_content, config_content)) + config_count = len([f for f in all_features if f['category'] == 'configuration']) + print(f" Configuration: {config_count}") + + # Advanced features + all_features.extend(extract_advanced_features()) + adv_count = len([f for f in all_features if f['category'] in ('advanced', 'metadata')]) + print(f" Advanced + Metadata: {adv_count}") + + # UI features + all_features.extend(extract_ui_features()) + ui_count = len([f for f in all_features if f['category'] == 'ui_api']) + print(f" UI/API: {ui_count}") + + # Build categories summary + category_counts = {} + for f in all_features: + cat = f['category'] + category_counts[cat] = category_counts.get(cat, 0) + 1 + + categories = [ + {'id': cat, 'name': cat.replace('_', ' ').title(), 'featureCount': count} + for cat, count in sorted(category_counts.items()) + ] + + catalog = { + 'project': { + 'name': 'Data Caterer', + 'version': '0.19.0', + 'repository': 'https://github.com/data-catering/data-caterer', + 'lastUpdated': '2026-02-11', + }, + 'categories': categories, + 'features': all_features, + } + + print(f"\nTotal features extracted: {len(all_features)}") + return catalog + + +def main(): + catalog = build_catalog() + output_path = FEATURE_CATALOG_DIR / "features.json" + save_json(catalog, output_path) + print(f"\nFeature catalog saved to: {output_path}") + + +if 
__name__ == '__main__': + main() diff --git a/misc/feature-catalog/scripts/generate_markdown.py b/misc/feature-catalog/scripts/generate_markdown.py new file mode 100644 index 00000000..ed4459a1 --- /dev/null +++ b/misc/feature-catalog/scripts/generate_markdown.py @@ -0,0 +1,395 @@ +#!/usr/bin/env python3 +""" +Generate Markdown documentation from the features.json catalog. + +Produces: + docs/index.md - Main overview and navigation + docs/categories/.md - Per-category feature documentation + +Usage: + python scripts/generate_markdown.py +""" + +import sys +from collections import defaultdict +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent)) +from utils import FEATURE_CATALOG_DIR, load_json + + +DOCS_DIR = FEATURE_CATALOG_DIR / "docs" +CATEGORIES_DIR = DOCS_DIR / "categories" + +# Category display order and descriptions +CATEGORY_META = { + 'connectors': { + 'title': 'Data Source Connectors', + 'icon': '', + 'description': 'Data Caterer supports connecting to databases, file systems, messaging systems, and HTTP APIs for reading and writing test data.', + }, + 'generation': { + 'title': 'Data Generation', + 'icon': '', + 'description': 'Comprehensive data generation capabilities including regex patterns, faker expressions, SQL computations, and field-level configuration options.', + }, + 'validation': { + 'title': 'Data Validation', + 'icon': '', + 'description': 'Over 30 validation types for verifying generated data quality, schema compliance, statistical properties, and cross-source consistency.', + }, + 'configuration': { + 'title': 'Configuration', + 'icon': '', + 'description': 'Runtime configuration for controlling generation behavior, validation, performance tuning, alerts, and output paths.', + }, + 'advanced': { + 'title': 'Advanced Features', + 'icon': '', + 'description': 'Foreign key relationships, streaming load patterns, custom transformations, metadata-driven generation, and more.', + }, + 'metadata': { + 'title': 'Metadata 
def _table_cell_text(text) -> str:
    """Make free text safe to place inside a single Markdown table cell."""
    # A table row must stay on one physical line, so fold line breaks into spaces.
    flattened = ' '.join(str(text).splitlines())
    # Un-escape pipes that are already escaped, then escape every pipe exactly once;
    # a bare "|" would otherwise be read as a column separator.
    return flattened.replace('\\|', '|').replace('|', '\\|')


def generate_config_table(config_options: list[dict]) -> str:
    """Generate a Markdown table for configuration options.

    Args:
        config_options: Option dicts. ``name`` is required; ``type``,
            ``required``, ``default``, ``description``, ``validValues`` and
            ``yamlPath`` are read when present.

    Returns:
        The table as a string wrapped in a leading and a trailing blank line,
        or ``""`` when there are no options.
    """
    if not config_options:
        return ""

    lines = [
        "",
        "| Option | Type | Required | Default | Description |",
        "|--------|------|----------|---------|-------------|",
    ]
    for opt in config_options:
        name = f"`{opt['name']}`"
        type_ = opt.get('type', '')
        required = 'Yes' if opt.get('required') else 'No'

        default = opt.get('default', '-')
        if default is None or default == '':
            default = '-'
        elif isinstance(default, bool):
            # Render booleans the way they are written in YAML/JSON (true/false).
            default = str(default).lower()

        # Only the free-text description is escaped; the code spans appended
        # below are generated from identifiers and paths.
        desc = _table_cell_text(opt.get('description', ''))

        valid = opt.get('validValues', [])
        if valid:
            desc += f" Values: {', '.join(f'`{v}`' for v in valid)}"

        yaml_path = opt.get('yamlPath', '')
        if yaml_path:
            desc += f" YAML: `{yaml_path}`"

        lines.append(f"| {name} | {type_} | {required} | `{default}` | {desc} |")

    lines.append("")
    return "\n".join(lines)
def _heading_anchor(title: str) -> str:
    """Derive the in-page anchor for a heading from its visible text.

    Follows the usual Markdown slug rule: drop punctuation, lower-case, and
    join words with single hyphens.
    NOTE(review): mirrors the Python-Markdown ``toc`` slug; confirm against the
    renderer actually used to publish these pages.
    """
    import re  # local import: the surrounding module does not import re at file level

    cleaned = re.sub(r'[^\w\s-]', '', title).strip().lower()
    return re.sub(r'[-\s]+', '-', cleaned)


def generate_category_page(category_id: str, features: list[dict]) -> str:
    """Generate a full Markdown page for a category.

    Args:
        category_id: Category key, looked up in ``CATEGORY_META`` for the
            title and description; the id is title-cased when there is no entry.
        features: Feature dicts belonging to this category.

    Returns:
        The page text: header, feature count, a table of contents by
        subcategory, then every feature section grouped by subcategory.
    """
    meta = CATEGORY_META.get(category_id, {})
    title = meta.get('title', category_id.replace('_', ' ').title())
    description = meta.get('description', '')

    lines = [f"# {title}", ""]
    if description:
        lines.extend([description, ""])

    lines.extend([
        f"**{len(features)} features** in this category.",
        "",
        "## Table of Contents",
        "",
    ])

    # Group by subcategory, keeping first-seen order.
    by_sub = defaultdict(list)
    for feature in features:
        by_sub[feature.get('subcategory', 'other')].append(feature)

    sections = [
        (SUBCATEGORY_TITLES.get(sub, sub.replace('_', ' ').title()), sub_features)
        for sub, sub_features in by_sub.items()
    ]

    for sub_title, sub_features in sections:
        # The anchor must come from the heading text emitted below, not from the
        # subcategory id: "files" is rendered as "## File Formats".
        anchor = _heading_anchor(sub_title)
        lines.append(f"- [{sub_title}](#{anchor}) ({len(sub_features)} features)")

    lines.append("")

    # Feature sections grouped by subcategory.
    for sub_title, sub_features in sections:
        lines.append(f"## {sub_title}")
        lines.append("")
        for feature in sub_features:
            lines.append(generate_feature_section(feature))

    return "\n".join(lines)
CATEGORY_META.get(cat['id'], {}) + title = meta.get('title', cat['name']) + count = cat.get('featureCount', 0) + filename = cat['id'].replace('_', '-') + short_desc = meta.get('description', '')[:80] + if len(meta.get('description', '')) > 80: + short_desc += '...' + lines.append(f"| [{title}](categories/{filename}.md) | {count} | {short_desc} |") + lines.append("") + + total = sum(c.get('featureCount', 0) for c in sorted_cats) + lines.append(f"**Total: {total} features**") + lines.append("") + + # All features alphabetical index + lines.append("## All Features (Alphabetical)") + lines.append("") + + sorted_features = sorted(features, key=lambda f: f['name'].lower()) + for f in sorted_features: + cat_id = f['category'].replace('_', '-') + sub = f.get('subcategory', '') + anchor = f['name'].lower().replace(' ', '-').replace('/', '-').replace('(', '').replace(')', '') + lines.append(f"- [{f['name']}](categories/{cat_id}.md#{anchor}) - `{f['id']}`") + + lines.append("") + + return "\n".join(lines) + + +def main(): + print("Loading features.json...") + catalog_path = FEATURE_CATALOG_DIR / "features.json" + catalog = load_json(catalog_path) + + features = catalog.get('features', []) + print(f"Loaded {len(features)} features") + + # Create directories + DOCS_DIR.mkdir(parents=True, exist_ok=True) + CATEGORIES_DIR.mkdir(parents=True, exist_ok=True) + + # Generate index + print("Generating index.md...") + index_content = generate_index(catalog) + (DOCS_DIR / "index.md").write_text(index_content) + + # Group features by category + by_category = defaultdict(list) + for f in features: + by_category[f['category']].append(f) + + # Generate category pages + for cat_id, cat_features in by_category.items(): + filename = cat_id.replace('_', '-') + '.md' + print(f"Generating categories/{filename} ({len(cat_features)} features)...") + content = generate_category_page(cat_id, cat_features) + (CATEGORIES_DIR / filename).write_text(content) + + print(f"\nDocumentation generated in: 
def load_json(path: Path) -> dict:
    """Load a JSON file.

    The file is decoded as UTF-8 regardless of the platform locale, matching
    what ``save_json`` writes.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def save_json(data: dict, path: Path, indent: int = 2):
    """Save data as formatted JSON, creating parent directories as needed.

    Args:
        data: JSON-serialisable object to write.
        path: Destination file.
        indent: Indentation width passed to ``json.dump``.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file must be
    # opened as UTF-8 explicitly; the locale default cannot encode them everywhere.
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=indent, ensure_ascii=False)
    print(f"Saved: {path}")


def read_file(path: Path) -> str:
    """Read a text file as UTF-8 and return its full contents."""
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()
+ + Returns list of dicts with: + - name: Scala variable name + - value: The string/numeric value + - comment: Any preceding comment + """ + results = [] + lines = content.split('\n') + current_comment = "" + + for i, line in enumerate(lines): + stripped = line.strip() + + # Track comments + if stripped.startswith('//'): + current_comment = stripped.lstrip('/ ').strip() + continue + + # Match lazy val declarations with string values + match = re.match( + r'lazy\s+val\s+(\w+)\s*[:=]\s*(?:String\s*=\s*)?["\'](.+?)["\']', + stripped + ) + if match: + results.append({ + 'name': match.group(1), + 'value': match.group(2), + 'comment': current_comment, + 'line': i + 1, + }) + current_comment = "" + continue + + # Match lazy val with numeric values + match = re.match( + r'lazy\s+val\s+(\w+)\s*[:=]\s*(?:\w+\s*=\s*)?(-?[\d.]+)L?', + stripped + ) + if match: + val = match.group(2) + results.append({ + 'name': match.group(1), + 'value': float(val) if '.' in val else int(val), + 'comment': current_comment, + 'line': i + 1, + }) + current_comment = "" + continue + + # Match lazy val with boolean values + match = re.match( + r'lazy\s+val\s+(\w+)\s*[:=]\s*(?:\w+\s*=\s*)?(true|false)', + stripped + ) + if match: + results.append({ + 'name': match.group(1), + 'value': match.group(2) == 'true', + 'comment': current_comment, + 'line': i + 1, + }) + current_comment = "" + continue + + # Reset comment if line is not a comment and not a lazy val + if stripped and not stripped.startswith('//'): + current_comment = "" + + return results + + +def parse_scala_case_class(content: str, class_name: str) -> list[dict]: + """ + Parse a Scala case class to extract fields with types and defaults. 
+ + Returns list of dicts with: + - name: Field name + - type: Scala type + - default: Default value (string representation) + """ + # Find the case class definition + pattern = rf'case\s+class\s+{class_name}\s*\((.*?)\)' + match = re.search(pattern, content, re.DOTALL) + if not match: + return [] + + body = match.group(1) + fields = [] + + # Split by commas that are not inside brackets + depth = 0 + current = "" + for char in body: + if char in '([{': + depth += 1 + elif char in ')]}': + depth -= 1 + elif char == ',' and depth == 0: + fields.append(current.strip()) + current = "" + continue + current += char + if current.strip(): + fields.append(current.strip()) + + results = [] + for field in fields: + # Parse field: name: Type = default + match = re.match(r'(\w+)\s*:\s*(\S+(?:\[.*?\])?)\s*(?:=\s*(.+))?', field.strip()) + if match: + results.append({ + 'name': match.group(1), + 'type': match.group(2), + 'default': match.group(3).strip() if match.group(3) else None, + }) + + return results + + +def make_feature_id(*parts: str) -> str: + """Create a standardized feature ID from parts.""" + return '.'.join( + re.sub(r'[^a-z0-9_]', '_', part.lower().strip()) + for part in parts + if part + ) + + +def make_feature( + id: str, + name: str, + category: str, + status: str, + description: str, + subcategory: str = "", + configuration: list[dict] | None = None, + examples: list[dict] | None = None, + tags: list[str] | None = None, + related_features: list[str] | None = None, + source_files: list[dict] | None = None, + dependencies: dict | None = None, + use_cases: list[str] | None = None, + limitations: list[str] | None = None, + performance_notes: list[str] | None = None, +) -> dict: + """Create a standardized feature dict.""" + feature = { + 'id': id, + 'name': name, + 'category': category, + 'status': status, + 'description': description, + } + if subcategory: + feature['subcategory'] = subcategory + if configuration: + feature['configuration'] = configuration + if 
examples: + feature['examples'] = examples + if tags: + feature['tags'] = tags + if related_features: + feature['relatedFeatures'] = related_features + if source_files: + feature['sourceFiles'] = source_files + if dependencies: + feature['dependencies'] = dependencies + if use_cases: + feature['useCases'] = use_cases + if limitations: + feature['limitations'] = limitations + if performance_notes: + feature['performanceNotes'] = performance_notes + return feature + + +def make_config_option( + name: str, + type: str, + description: str, + required: bool = False, + default: Any = None, + scope: str = "global", + valid_values: list | None = None, + range: dict | None = None, + yaml_path: str = "", + env_var: str = "", + scala_constant: str = "", +) -> dict: + """Create a standardized configuration option dict.""" + opt = { + 'name': name, + 'type': type, + 'description': description, + 'required': required, + 'scope': scope, + } + if default is not None: + opt['default'] = default + if valid_values: + opt['validValues'] = valid_values + if range: + opt['range'] = range + if yaml_path: + opt['yamlPath'] = yaml_path + if env_var: + opt['envVar'] = env_var + if scala_constant: + opt['scalaConstant'] = scala_constant + return opt + + +def make_example(format: str, code: str, title: str = "", description: str = "") -> dict: + """Create a standardized example dict.""" + ex = {'format': format, 'code': code} + if title: + ex['title'] = title + if description: + ex['description'] = description + return ex diff --git a/mkdocs.yml b/mkdocs.yml index 9ccda878..d7f190c2 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -104,6 +104,7 @@ nav: - Marquez: 'docs/guide/data-source/metadata/marquez.md' - OpenMetadata: 'docs/guide/data-source/metadata/open-metadata.md' - Open Data Contract Standard (ODCS): 'docs/guide/data-source/metadata/open-data-contract-standard.md' + - Data Caterer YAML: 'docs/guide/data-source/metadata/yaml-configurations.md' - Connections: 'docs/connection.md' - 
Generator: - Data Generator: 'docs/generator/data-generator.md' @@ -140,8 +141,10 @@ nav: - A year of getting paid from Medium articles: 'use-case/blog/a-year-of-getting-paid-from-medium-articles.md' - Shift Left Data Quality: 'use-case/blog/shift-left-data-quality.md' - Changelog: + - 0.19.1: 'use-case/changelog/0.19.1.md' - 0.19.0: 'use-case/changelog/0.19.0.md' - 0.18.0: 'use-case/changelog/0.18.0.md' + - 0.17.3: 'use-case/changelog/0.17.3.md' - 0.17.2: 'use-case/changelog/0.17.2.md' - 0.17.1: 'use-case/changelog/0.17.1.md' - 0.17.0: 'use-case/changelog/0.17.0.md' From 8046b48c0e1bfc436f13b57e6f952e2134ae301f Mon Sep 17 00:00:00 2001 From: Flook Peter Date: Sat, 14 Feb 2026 18:41:08 +1100 Subject: [PATCH 2/3] chore: Add spark.metrics.executorMetricsSource.enabled configuration - Updated configuration files to include "spark.metrics.executorMetricsSource.enabled" set to "false" for improved metrics handling. - Ensured consistency across application and test configurations by aligning the settings in Constants.scala and application.conf. - Modified SparkSuite to reflect the new configuration, enhancing test environment setup. 
--- .../github/datacatering/datacaterer/api/model/Constants.scala | 3 ++- app/src/test/resources/application.conf | 3 ++- .../github/datacatering/datacaterer/core/util/SparkSuite.scala | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala b/api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala index 591376c5..51de4cf0 100644 --- a/api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala +++ b/api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala @@ -288,7 +288,8 @@ object Constants { "spark.hadoop.fs.s3a.bucket.all.committer.magic.enabled" -> "true", "spark.hadoop.fs.hdfs.impl" -> "org.apache.hadoop.hdfs.DistributedFileSystem", "spark.hadoop.fs.file.impl" -> "com.globalmentor.apache.hadoop.fs.BareLocalFileSystem", - "spark.sql.extensions" -> "io.delta.sql.DeltaSparkSessionExtension,org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions" + "spark.sql.extensions" -> "io.delta.sql.DeltaSparkSessionExtension,org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions", + "spark.metrics.executorMetricsSource.enabled" -> "false" ) //jdbc defaults diff --git a/app/src/test/resources/application.conf b/app/src/test/resources/application.conf index cc30ce98..3e82c5d5 100644 --- a/app/src/test/resources/application.conf +++ b/app/src/test/resources/application.conf @@ -40,7 +40,8 @@ runtime{ "spark.hadoop.fs.s3a.bucket.all.committer.magic.enabled": "true", "spark.hadoop.fs.hdfs.impl": "org.apache.hadoop.hdfs.DistributedFileSystem", "spark.hadoop.fs.file.impl": "com.globalmentor.apache.hadoop.fs.BareLocalFileSystem", - "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension,org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions" + "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension,org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions", + 
"spark.metrics.executorMetricsSource.enabled": "false" } } diff --git a/app/src/test/scala/io/github/datacatering/datacaterer/core/util/SparkSuite.scala b/app/src/test/scala/io/github/datacatering/datacaterer/core/util/SparkSuite.scala index 09f6bf81..80df3935 100644 --- a/app/src/test/scala/io/github/datacatering/datacaterer/core/util/SparkSuite.scala +++ b/app/src/test/scala/io/github/datacatering/datacaterer/core/util/SparkSuite.scala @@ -14,6 +14,7 @@ trait SparkSuite extends AnyFunSuite with BeforeAndAfterAll with BeforeAndAfterE .config("spark.sql.shuffle.partitions", "2") // .config("spark.ui.enabled", "false") .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,io.delta.sql.DeltaSparkSessionExtension") + .config("spark.metrics.executorMetricsSource.enabled", "false") .getOrCreate() } From 7ff50a2500d91a6eda07be5d8118ef9c73e541f1 Mon Sep 17 00:00:00 2001 From: Flook Peter Date: Sat, 14 Feb 2026 19:08:08 +1100 Subject: [PATCH 3/3] chore: Update Java distribution and enhance Spark configurations - Changed Java distribution from Oracle to Temurin in build and check workflows for improved compatibility. - Added "spark.executor.processTreeMetrics.enabled" configuration to multiple files to enhance Spark metrics handling. - Ensured consistency across application and test configurations by aligning the settings in Constants.scala, application.conf, and application-integration.conf. 
--- .github/workflows/build.yml | 2 +- .github/workflows/check.yml | 2 +- .../datacatering/datacaterer/api/model/Constants.scala | 3 ++- app/src/test/resources/application-integration.conf | 5 ++++- app/src/test/resources/application.conf | 4 +++- .../datacatering/datacaterer/core/util/SparkSuite.scala | 2 +- 6 files changed, 12 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 5ed18727..7ece6a50 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -35,7 +35,7 @@ jobs: java-version: '17' java-package: jdk architecture: x64 - distribution: oracle + distribution: temurin - name: Login to DockerHub uses: docker/login-action@v2 with: diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml index f46db921..b105ee00 100644 --- a/.github/workflows/check.yml +++ b/.github/workflows/check.yml @@ -15,7 +15,7 @@ jobs: java-version: '17' java-package: jdk architecture: x64 - distribution: oracle + distribution: temurin - name: Gradle build with cache uses: burrunan/gradle-cache-action@v1 with: diff --git a/api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala b/api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala index 51de4cf0..705948d4 100644 --- a/api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala +++ b/api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala @@ -289,7 +289,8 @@ object Constants { "spark.hadoop.fs.hdfs.impl" -> "org.apache.hadoop.hdfs.DistributedFileSystem", "spark.hadoop.fs.file.impl" -> "com.globalmentor.apache.hadoop.fs.BareLocalFileSystem", "spark.sql.extensions" -> "io.delta.sql.DeltaSparkSessionExtension,org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions", - "spark.metrics.executorMetricsSource.enabled" -> "false" + "spark.metrics.executorMetricsSource.enabled" -> "false", + "spark.executor.processTreeMetrics.enabled" -> "false" ) //jdbc 
defaults diff --git a/app/src/test/resources/application-integration.conf b/app/src/test/resources/application-integration.conf index 7b4dbd27..7a07086b 100644 --- a/app/src/test/resources/application-integration.conf +++ b/app/src/test/resources/application-integration.conf @@ -110,7 +110,10 @@ runtime { "spark.hadoop.fs.s3a.bucket.all.committer.magic.enabled" = "true", "spark.hadoop.fs.hdfs.impl" = "org.apache.hadoop.hdfs.DistributedFileSystem", "spark.hadoop.fs.file.impl" = "com.globalmentor.apache.hadoop.fs.BareLocalFileSystem", - "spark.sql.extensions" = "io.delta.sql.DeltaSparkSessionExtension,org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions" + "spark.sql.extensions" = "io.delta.sql.DeltaSparkSessionExtension,org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions", + "spark.metrics.executorMetricsSource.enabled" = "false", + "spark.ui.enabled" = "false", + "spark.executor.processTreeMetrics.enabled" = "false" } } diff --git a/app/src/test/resources/application.conf b/app/src/test/resources/application.conf index 3e82c5d5..abd086c1 100644 --- a/app/src/test/resources/application.conf +++ b/app/src/test/resources/application.conf @@ -41,7 +41,9 @@ runtime{ "spark.hadoop.fs.hdfs.impl": "org.apache.hadoop.hdfs.DistributedFileSystem", "spark.hadoop.fs.file.impl": "com.globalmentor.apache.hadoop.fs.BareLocalFileSystem", "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension,org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions", - "spark.metrics.executorMetricsSource.enabled": "false" + "spark.metrics.executorMetricsSource.enabled": "false", + "spark.ui.enabled": "false", + "spark.executor.processTreeMetrics.enabled": "false" } } diff --git a/app/src/test/scala/io/github/datacatering/datacaterer/core/util/SparkSuite.scala b/app/src/test/scala/io/github/datacatering/datacaterer/core/util/SparkSuite.scala index 80df3935..22468c7f 100644 --- 
a/app/src/test/scala/io/github/datacatering/datacaterer/core/util/SparkSuite.scala +++ b/app/src/test/scala/io/github/datacatering/datacaterer/core/util/SparkSuite.scala @@ -12,7 +12,7 @@ trait SparkSuite extends AnyFunSuite with BeforeAndAfterAll with BeforeAndAfterE .appName("spark tests") .config("spark.sql.legacy.allowUntypedScalaUDF", "true") .config("spark.sql.shuffle.partitions", "2") -// .config("spark.ui.enabled", "false") + .config("spark.ui.enabled", "false") .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,io.delta.sql.DeltaSparkSessionExtension") .config("spark.metrics.executorMetricsSource.enabled", "false") .getOrCreate()