From ff2c33df1407b57d075b8fc22180a74e09246968 Mon Sep 17 00:00:00 2001 From: Anna Benke Date: Tue, 12 May 2026 15:50:16 +0200 Subject: [PATCH] feat(gooddata-pipelines): support composite key references on parent datasets Co-Authored-By: Claude Sonnet 4.6 --- .../latest/pipelines/ldm_extension/_index.md | 30 +++++-- .../src/gooddata_pipelines/__init__.py | 2 + .../ldm_extension/input_processor.py | 55 ++++++++++--- .../models/custom_data_object.py | 72 ++++++++++++++++- .../test_input_processor.py | 49 ++++++++++++ .../test_models/test_custom_data_object.py | 79 +++++++++++++++++++ 6 files changed, 268 insertions(+), 19 deletions(-) diff --git a/docs/content/en/latest/pipelines/ldm_extension/_index.md b/docs/content/en/latest/pipelines/ldm_extension/_index.md index 2b5ff949f..0a32d9c18 100644 --- a/docs/content/en/latest/pipelines/ldm_extension/_index.md +++ b/docs/content/en/latest/pipelines/ldm_extension/_index.md @@ -40,9 +40,10 @@ The custom dataset represents a new dataset appended to the child LDM. It is def | dataset_source_table | string | Name of the table in the Physical Data Model. | | dataset_source_sql | string \| None | SQL query defining the dataset. | | parent_dataset_reference | string \| None | ID of the parent dataset to which the custom one will be connected. | -| parent_dataset_reference_attribute_id | string | ID of the attribute used for creating the relationship in the parent dataset. | -| dataset_reference_source_column | string | Name of the column used for creating the relationship in the custom dataset. | -| dataset_reference_source_column_data_type | [ColumnDataType](#columndatatype) | Column data type. | +| parent_dataset_reference_attribute_id | string \| None | **Deprecated** — use `parent_dataset_references` instead. | +| dataset_reference_source_column | string \| None | **Deprecated** — use `parent_dataset_references` instead. 
| +| dataset_reference_source_column_data_type | [ColumnDataType](#columndatatype) \| None | **Deprecated** — use `parent_dataset_references` instead. | +| parent_dataset_references | [ParentDatasetReference](#parentdatasetreference)[] \| None | List of references to the parent dataset. | | workspace_data_filter_id | string | ID of the workspace data filter to use. | | workspace_data_filter_column_name | string | Name of the column in custom dataset used for filtering. | | dataset_description | string \| None | Optional declarative description on the custom dataset. | @@ -52,6 +53,18 @@ The custom dataset represents a new dataset appended to the child LDM. It is def Either `dataset_source_table` or `dataset_source_sql` must be specified with a truthy value, but not both. An exception will be raised if both parameters are falsy or if both have truthy values. +Exactly one reference form must be used: either `parent_dataset_references` (which must contain at least one entry) or the deprecated single-column fields, but not both. An exception will be raised if both forms are provided or if neither is. + +#### ParentDatasetReference + +Bundles one column of a (possibly composite) join to the parent dataset. Pass a list of these on `CustomDatasetDefinition.parent_dataset_references`, one entry per join column. + +| name | type | description | +|------|------|-------------| +| attribute_id | string | ID of the attribute on the parent dataset that this column joins to. | +| source_column | string | Name of the column on this dataset used to join to the parent. | +| data_type | [ColumnDataType](#columndatatype) | Data type of the source column. | + ### Custom Field Definitions The custom fields define the individual fields in the custom datasets defined above. 
Each custom field needs to be specified with the following parameters: @@ -162,6 +175,7 @@ from gooddata_pipelines import ( CustomFieldDefinition, CustomFieldType, LdmExtensionManager, + ParentDatasetReference, ) import logging @@ -188,9 +202,13 @@ custom_dataset_definitions = [ dataset_source_table="products_custom", dataset_source_sql=None, parent_dataset_reference="products", - parent_dataset_reference_attribute_id="products.product_id", - dataset_reference_source_column="product_id", - dataset_reference_source_column_data_type=ColumnDataType.INT, + parent_dataset_references=[ + ParentDatasetReference( + attribute_id="products.product_id", + source_column="product_id", + data_type=ColumnDataType.INT, + ), + ], workspace_data_filter_id="wdf_id", workspace_data_filter_column_name="wdf_column", ) diff --git a/packages/gooddata-pipelines/src/gooddata_pipelines/__init__.py b/packages/gooddata-pipelines/src/gooddata_pipelines/__init__.py index 0f5bd3236..f3c8920da 100644 --- a/packages/gooddata-pipelines/src/gooddata_pipelines/__init__.py +++ b/packages/gooddata-pipelines/src/gooddata_pipelines/__init__.py @@ -26,6 +26,7 @@ CustomDatasetDefinition, CustomFieldDefinition, CustomFieldType, + ParentDatasetReference, ) # -------- Provisioning -------- @@ -93,6 +94,7 @@ "CustomFieldDefinition", "ColumnDataType", "CustomFieldType", + "ParentDatasetReference", "provision", "WorkflowType", "__version__", diff --git a/packages/gooddata-pipelines/src/gooddata_pipelines/ldm_extension/input_processor.py b/packages/gooddata-pipelines/src/gooddata_pipelines/ldm_extension/input_processor.py index 6f43a037c..3c2d9afc2 100644 --- a/packages/gooddata-pipelines/src/gooddata_pipelines/ldm_extension/input_processor.py +++ b/packages/gooddata-pipelines/src/gooddata_pipelines/ldm_extension/input_processor.py @@ -154,6 +154,46 @@ def _date_ref_from_field( ], ) + @staticmethod + def _build_parent_reference_sources( + definition: CustomDatasetDefinition, + ) -> 
list[CatalogDeclarativeReferenceSource]: + """Build the reference sources from either the new list or the legacy triple.""" + if definition.parent_dataset_references: + return [ + CatalogDeclarativeReferenceSource( + column=ref.source_column, + data_type=ref.data_type.value, + target=CatalogGrainIdentifier( + id=ref.attribute_id, + type=CustomFieldType.ATTRIBUTE.value, + ), + ) + for ref in definition.parent_dataset_references + ] + + # `check_reference_form` on the model only guarantees that at least one + # legacy field is set, so the complete triple is verified here before use. + if ( + definition.parent_dataset_reference_attribute_id is None + or definition.dataset_reference_source_column is None + or definition.dataset_reference_source_column_data_type is None + ): + raise ValueError( + "Legacy reference fields must be set when " + "`parent_dataset_references` is not provided." + ) + return [ + CatalogDeclarativeReferenceSource( + column=definition.dataset_reference_source_column, + data_type=definition.dataset_reference_source_column_data_type.value, + target=CatalogGrainIdentifier( + id=definition.parent_dataset_reference_attribute_id, + type=CustomFieldType.ATTRIBUTE.value, + ), + ) + ] + @staticmethod def _get_sources( dataset: CustomDataset, @@ -253,6 +293,10 @@ def datasets_to_ldm( # Get the data source info dataset_source_table_id, dataset_sql = self._get_sources(dataset) + parent_reference_sources = self._build_parent_reference_sources( + dataset.definition + ) + # Construct the declarative dataset object and append it to the list. 
declarative_datasets.append( CatalogDeclarativeDataset( @@ -265,16 +309,7 @@ def datasets_to_ldm( id=dataset.definition.parent_dataset_reference, ), multivalue=True, - sources=[ - CatalogDeclarativeReferenceSource( - column=dataset.definition.dataset_reference_source_column, - data_type=dataset.definition.dataset_reference_source_column_data_type.value, - target=CatalogGrainIdentifier( - id=dataset.definition.parent_dataset_reference_attribute_id, - type=CustomFieldType.ATTRIBUTE.value, - ), - ) - ], + sources=parent_reference_sources, ), ] + date_references, diff --git a/packages/gooddata-pipelines/src/gooddata_pipelines/ldm_extension/models/custom_data_object.py b/packages/gooddata-pipelines/src/gooddata_pipelines/ldm_extension/models/custom_data_object.py index 9c0dae3a4..48825212d 100644 --- a/packages/gooddata-pipelines/src/gooddata_pipelines/ldm_extension/models/custom_data_object.py +++ b/packages/gooddata-pipelines/src/gooddata_pipelines/ldm_extension/models/custom_data_object.py @@ -61,6 +61,25 @@ def check_ids_not_equal(self) -> "CustomFieldDefinition": return self +class ParentDatasetReference(BaseModel): + """One column of a (possibly composite) join to the parent dataset. + + A list of these on ``CustomDatasetDefinition.parent_dataset_references`` + supports multi-column foreign keys. Each entry binds a source column on the + new dataset to a grain attribute on the parent. 
+ """ + + attribute_id: str = Field( + description="Attribute ID on the parent dataset that this column joins to.", + ) + source_column: str = Field( + description="Column name on this dataset used to join to the parent.", + ) + data_type: ColumnDataType = Field( + description="Data type of the source column.", + ) + + class CustomDatasetDefinition(BaseModel): """Input model for custom dataset definition.""" @@ -71,9 +90,31 @@ class CustomDatasetDefinition(BaseModel): dataset_source_table: str | None dataset_source_sql: str | None parent_dataset_reference: str - parent_dataset_reference_attribute_id: str - dataset_reference_source_column: str - dataset_reference_source_column_data_type: ColumnDataType + parent_dataset_reference_attribute_id: str | None = Field( + default=None, + deprecated=( + "Use `parent_dataset_references` instead. " + "This field will be removed in a future release." + ), + ) + dataset_reference_source_column: str | None = Field( + default=None, + deprecated=( + "Use `parent_dataset_references` instead. " + "This field will be removed in a future release." + ), + ) + dataset_reference_source_column_data_type: ColumnDataType | None = Field( + default=None, + deprecated=( + "Use `parent_dataset_references` instead. " + "This field will be removed in a future release." 
+ ), + ) + parent_dataset_references: list[ParentDatasetReference] | None = Field( + default=None, + description="List of references to the parent dataset.", + ) workspace_data_filter_id: str workspace_data_filter_column_name: str dataset_description: str | None = Field( @@ -98,6 +139,31 @@ def check_source(self) -> "CustomDatasetDefinition": ) return self + @model_validator(mode="after") + def check_reference_form(self) -> "CustomDatasetDefinition": + """Exactly one reference form must be set: either the new list or the legacy triple.""" + has_new = bool(self.parent_dataset_references) + has_legacy = ( + self.parent_dataset_reference_attribute_id is not None + or self.dataset_reference_source_column is not None + or self.dataset_reference_source_column_data_type is not None + ) + if has_new and has_legacy: + raise ValueError( + "Set either `parent_dataset_references` or the legacy single-column " + "fields (`parent_dataset_reference_attribute_id`, " + "`dataset_reference_source_column`, " + "`dataset_reference_source_column_data_type`), not both." + ) + if not has_new and not has_legacy: + raise ValueError( + "Provide either `parent_dataset_references` or the legacy single-column " + "fields (`parent_dataset_reference_attribute_id`, " + "`dataset_reference_source_column`, " + "`dataset_reference_source_column_data_type`)." 
+ ) + return self + class CustomDataset(BaseModel): """Custom dataset with its definition and custom fields.""" diff --git a/packages/gooddata-pipelines/tests/test_ldm_extension/test_input_processor.py b/packages/gooddata-pipelines/tests/test_ldm_extension/test_input_processor.py index 8c50cd571..84476f211 100644 --- a/packages/gooddata-pipelines/tests/test_ldm_extension/test_input_processor.py +++ b/packages/gooddata-pipelines/tests/test_ldm_extension/test_input_processor.py @@ -129,3 +129,52 @@ def test_datasets_to_ldm(mock_custom_dataset): assert ds.workspace_data_filter_references[0].filter_id.id == "wdf1" assert len(ldm.date_instances) == 1 assert ldm.date_instances[0].id == "date1" + + +def test_datasets_to_ldm_parent_dataset_references_composite(): + """Multi-column references via `parent_dataset_references` produce N sources.""" + from gooddata_pipelines.ldm_extension.models.custom_data_object import ( + CustomDatasetDefinition, + ParentDatasetReference, + ) + + definition = CustomDatasetDefinition( + workspace_id="workspace1", + dataset_id="ds_composite", + dataset_name="Composite Dataset", + dataset_source_table="table1", + dataset_datasource_id="ds_source", + dataset_source_sql=None, + parent_dataset_reference="parent_ds", + parent_dataset_references=[ + ParentDatasetReference( + attribute_id="parent_pk1", + source_column="src_col1", + data_type=ColumnDataType.STRING, + ), + ParentDatasetReference( + attribute_id="parent_pk2", + source_column="src_col2", + data_type=ColumnDataType.INT, + ), + ], + workspace_data_filter_id="wdf1", + workspace_data_filter_column_name="col1", + ) + ds = CustomDataset(definition=definition, custom_fields=[]) + processor = LdmExtensionDataProcessor() + model = processor.datasets_to_ldm({"ds_composite": ds}) + parent_ref = model.ldm.datasets[0].references[0] + assert len(parent_ref.sources) == 2 + assert [s.column for s in parent_ref.sources] == ["src_col1", "src_col2"] + + +def 
test_datasets_to_ldm_legacy_reference_fallback(mock_dataset_definition): + """When `parent_dataset_references` is not set, fall back to legacy fields.""" + mock_dataset_definition.parent_dataset_references = None + ds = CustomDataset(definition=mock_dataset_definition, custom_fields=[]) + processor = LdmExtensionDataProcessor() + model = processor.datasets_to_ldm({"ds1": ds}) + parent_ref = model.ldm.datasets[0].references[0] + assert len(parent_ref.sources) == 1 + assert parent_ref.sources[0].column == "ref_col" diff --git a/packages/gooddata-pipelines/tests/test_ldm_extension/test_models/test_custom_data_object.py b/packages/gooddata-pipelines/tests/test_ldm_extension/test_models/test_custom_data_object.py index f0c605b15..d45ee97c4 100644 --- a/packages/gooddata-pipelines/tests/test_ldm_extension/test_models/test_custom_data_object.py +++ b/packages/gooddata-pipelines/tests/test_ldm_extension/test_models/test_custom_data_object.py @@ -8,6 +8,7 @@ CustomDatasetDefinition, CustomFieldDefinition, CustomFieldType, + ParentDatasetReference, ) @@ -100,3 +101,81 @@ def test_custom_dataset_model(): assert dataset.definition.dataset_id == "ds1" assert len(dataset.custom_fields) == 1 assert dataset.custom_fields[0].custom_field_id == "cf1" + + +def test_custom_dataset_definition_parent_dataset_references_optional(): + """The new composite-reference field is optional and defaults to None.""" + ds = CustomDatasetDefinition(**make_valid_dataset_def()) + assert ds.parent_dataset_references is None + + +def test_custom_dataset_definition_parent_dataset_references_accepted(): + """Composite references can be provided via the new list field.""" + refs = [ + ParentDatasetReference( + attribute_id="parent_pk1", + source_column="src_col1", + data_type=ColumnDataType.STRING, + ), + ParentDatasetReference( + attribute_id="parent_pk2", + source_column="src_col2", + data_type=ColumnDataType.INT, + ), + ] + data = make_valid_dataset_def( + parent_dataset_reference_attribute_id=None, + 
dataset_reference_source_column=None, + dataset_reference_source_column_data_type=None, + parent_dataset_references=refs, + ) + ds = CustomDatasetDefinition(**data) + assert ds.parent_dataset_references is not None + assert len(ds.parent_dataset_references) == 2 + assert ds.parent_dataset_references[1].data_type == ColumnDataType.INT + + +def test_custom_dataset_definition_no_reference_form_raises(): + """Providing neither the legacy fields nor `parent_dataset_references` is rejected.""" + data = make_valid_dataset_def( + parent_dataset_reference_attribute_id=None, + dataset_reference_source_column=None, + dataset_reference_source_column_data_type=None, + ) + with pytest.raises(ValidationError) as exc: + CustomDatasetDefinition(**data) + assert "Provide either" in str(exc.value) + + +def test_custom_dataset_definition_mixed_reference_forms_raises(): + """Setting both legacy fields and `parent_dataset_references` is rejected.""" + data = make_valid_dataset_def( + parent_dataset_references=[ + ParentDatasetReference( + attribute_id="parent_pk", + source_column="src_col", + data_type=ColumnDataType.STRING, + ) + ], + ) + with pytest.raises(ValidationError) as exc: + CustomDatasetDefinition(**data) + assert "not both" in str(exc.value) + + +def test_custom_dataset_definition_legacy_reference_fields_optional(): + data = make_valid_dataset_def( + parent_dataset_reference_attribute_id=None, + dataset_reference_source_column=None, + dataset_reference_source_column_data_type=None, + parent_dataset_references=[ + ParentDatasetReference( + attribute_id="parent_pk", + source_column="src_col", + data_type=ColumnDataType.STRING, + ) + ], + ) + ds = CustomDatasetDefinition(**data) + assert ds.dataset_reference_source_column is None + assert ds.parent_dataset_references is not None