Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 24 additions & 6 deletions docs/content/en/latest/pipelines/ldm_extension/_index.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,10 @@ The custom dataset represents a new dataset appended to the child LDM. It is def
| dataset_source_table | string | Name of the table in the Physical Data Model. |
| dataset_source_sql | string \| None | SQL query defining the dataset. |
| parent_dataset_reference | string | ID of the parent dataset to which the custom one will be connected. |
| parent_dataset_reference_attribute_id | string | ID of the attribute used for creating the relationship in the parent dataset. |
| dataset_reference_source_column | string | Name of the column used for creating the relationship in the custom dataset. |
| dataset_reference_source_column_data_type | [ColumnDataType](#columndatatype) | Column data type. |
| parent_dataset_reference_attribute_id | string \| None | **Deprecated** — use `parent_dataset_references` instead. |
| dataset_reference_source_column | string \| None | **Deprecated** — use `parent_dataset_references` instead. |
| dataset_reference_source_column_data_type | [ColumnDataType](#columndatatype) \| None | **Deprecated** — use `parent_dataset_references` instead. |
| parent_dataset_references | [ParentDatasetReference](#parentdatasetreference)[] \| None | List of references to the parent dataset. |
| workspace_data_filter_id | string | ID of the workspace data filter to use. |
| workspace_data_filter_column_name | string | Name of the column in custom dataset used for filtering. |
| dataset_description | string \| None | Optional declarative description on the custom dataset. |
Expand All @@ -52,6 +53,18 @@ The custom dataset represents a new dataset appended to the child LDM. It is def

Either `dataset_source_table` or `dataset_source_sql` must be specified with a truthy value, but not both. An exception will be raised if both parameters are falsy or if both have truthy values.

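Exactly one reference form must be used: either `parent_dataset_references` (containing at least one entry) or the three deprecated single-column fields (`parent_dataset_reference_attribute_id`, `dataset_reference_source_column`, `dataset_reference_source_column_data_type`). An exception will be raised if both forms are provided or if neither is.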

#### ParentDatasetReference

Bundles one column of a (possibly composite) join to the parent dataset. Pass a list of these on `CustomDatasetDefinition.parent_dataset_references`, one entry per join column.

| name | type | description |
|------|------|-------------|
| attribute_id | string | ID of the attribute on the parent dataset that this column joins to. |
| source_column | string | Name of the column on this dataset used to join to the parent. |
| data_type | [ColumnDataType](#columndatatype) | Data type of the source column. |

### Custom Field Definitions

The custom fields define the individual fields in the custom datasets defined above. Each custom field needs to be specified with the following parameters:
Expand Down Expand Up @@ -162,6 +175,7 @@ from gooddata_pipelines import (
CustomFieldDefinition,
CustomFieldType,
LdmExtensionManager,
ParentDatasetReference,
)

import logging
Expand All @@ -188,9 +202,13 @@ custom_dataset_definitions = [
dataset_source_table="products_custom",
dataset_source_sql=None,
parent_dataset_reference="products",
parent_dataset_reference_attribute_id="products.product_id",
dataset_reference_source_column="product_id",
dataset_reference_source_column_data_type=ColumnDataType.INT,
parent_dataset_references=[
ParentDatasetReference(
attribute_id="products.product_id",
source_column="product_id",
data_type=ColumnDataType.INT,
),
],
workspace_data_filter_id="wdf_id",
workspace_data_filter_column_name="wdf_column",
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
CustomDatasetDefinition,
CustomFieldDefinition,
CustomFieldType,
ParentDatasetReference,
)

# -------- Provisioning --------
Expand Down Expand Up @@ -93,6 +94,7 @@
"CustomFieldDefinition",
"ColumnDataType",
"CustomFieldType",
"ParentDatasetReference",
"provision",
"WorkflowType",
"__version__",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,46 @@ def _date_ref_from_field(
],
)

@staticmethod
def _build_parent_reference_sources(
    definition: CustomDatasetDefinition,
) -> list[CatalogDeclarativeReferenceSource]:
    """Produce the reference sources that join a custom dataset to its parent.

    A non-empty ``parent_dataset_references`` list takes precedence and yields
    one source per entry, in list order. Otherwise a single source is built
    from the three legacy single-column fields.

    Args:
        definition: Dataset definition carrying the join columns.

    Returns:
        Reference sources, each targeting an attribute identified by the
        configured attribute ID.

    Raises:
        ValueError: If ``parent_dataset_references`` is empty or unset and
            any of the three legacy fields is ``None``.
    """
    if definition.parent_dataset_references:
        sources: list[CatalogDeclarativeReferenceSource] = []
        for reference in definition.parent_dataset_references:
            sources.append(
                CatalogDeclarativeReferenceSource(
                    column=reference.source_column,
                    data_type=reference.data_type.value,
                    target=CatalogGrainIdentifier(
                        id=reference.attribute_id,
                        type=CustomFieldType.ATTRIBUTE.value,
                    ),
                )
            )
        return sources

    # Defensive guard: the legacy form is only usable when all three fields
    # are present. The explicit `is not None` checks also narrow the optional
    # types for the accesses below.
    if not (
        definition.parent_dataset_reference_attribute_id is not None
        and definition.dataset_reference_source_column is not None
        and definition.dataset_reference_source_column_data_type is not None
    ):
        raise ValueError(
            "Legacy reference fields must be set when "
            "`parent_dataset_references` is not provided."
        )

    legacy_source = CatalogDeclarativeReferenceSource(
        column=definition.dataset_reference_source_column,
        data_type=definition.dataset_reference_source_column_data_type.value,
        target=CatalogGrainIdentifier(
            id=definition.parent_dataset_reference_attribute_id,
            type=CustomFieldType.ATTRIBUTE.value,
        ),
    )
    return [legacy_source]

@staticmethod
def _get_sources(
dataset: CustomDataset,
Expand Down Expand Up @@ -253,6 +293,10 @@ def datasets_to_ldm(
# Get the data source info
dataset_source_table_id, dataset_sql = self._get_sources(dataset)

parent_reference_sources = self._build_parent_reference_sources(
dataset.definition
)

# Construct the declarative dataset object and append it to the list.
declarative_datasets.append(
CatalogDeclarativeDataset(
Expand All @@ -265,16 +309,7 @@ def datasets_to_ldm(
id=dataset.definition.parent_dataset_reference,
),
multivalue=True,
sources=[
CatalogDeclarativeReferenceSource(
column=dataset.definition.dataset_reference_source_column,
data_type=dataset.definition.dataset_reference_source_column_data_type.value,
target=CatalogGrainIdentifier(
id=dataset.definition.parent_dataset_reference_attribute_id,
type=CustomFieldType.ATTRIBUTE.value,
),
)
],
sources=parent_reference_sources,
),
]
+ date_references,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,25 @@ def check_ids_not_equal(self) -> "CustomFieldDefinition":
return self


class ParentDatasetReference(BaseModel):
    """A single join column linking a custom dataset to its parent dataset.

    Composite (multi-column) foreign keys are expressed by supplying several
    instances in ``CustomDatasetDefinition.parent_dataset_references``. Each
    instance pairs one source column of the new dataset with one grain
    attribute of the parent.
    """

    # Parent side of the join.
    attribute_id: str = Field(
        description="Attribute ID on the parent dataset that this column joins to."
    )
    # Custom-dataset side of the join.
    source_column: str = Field(
        description="Column name on this dataset used to join to the parent."
    )
    # Type of `source_column`.
    data_type: ColumnDataType = Field(
        description="Data type of the source column."
    )


class CustomDatasetDefinition(BaseModel):
"""Input model for custom dataset definition."""

Expand All @@ -71,9 +90,31 @@ class CustomDatasetDefinition(BaseModel):
dataset_source_table: str | None
dataset_source_sql: str | None
parent_dataset_reference: str
parent_dataset_reference_attribute_id: str
dataset_reference_source_column: str
dataset_reference_source_column_data_type: ColumnDataType
parent_dataset_reference_attribute_id: str | None = Field(
default=None,
deprecated=(
"Use `parent_dataset_references` instead. "
"This field will be removed in a future release."
),
)
dataset_reference_source_column: str | None = Field(
default=None,
deprecated=(
"Use `parent_dataset_references` instead. "
"This field will be removed in a future release."
),
)
dataset_reference_source_column_data_type: ColumnDataType | None = Field(
default=None,
deprecated=(
"Use `parent_dataset_references` instead. "
"This field will be removed in a future release."
),
)
parent_dataset_references: list[ParentDatasetReference] | None = Field(
default=None,
description="List of references to the parent dataset.",
)
workspace_data_filter_id: str
workspace_data_filter_column_name: str
dataset_description: str | None = Field(
Expand All @@ -98,6 +139,31 @@ def check_source(self) -> "CustomDatasetDefinition":
)
return self

@model_validator(mode="after")
def check_reference_form(self) -> "CustomDatasetDefinition":
    """Validate that exactly one, fully specified, reference form is used.

    The parent reference may be given either through
    ``parent_dataset_references`` (a non-empty list) or through the deprecated
    single-column triple, never both. When the legacy form is used, all three
    of its fields must be present, so that an incomplete definition is
    rejected at validation time instead of later when the LDM is built.

    Returns:
        The validated model instance.

    Raises:
        ValueError: If both forms are set, if neither is set, or if the
            legacy triple is only partially filled in.
    """
    import warnings

    legacy_field_names = (
        "parent_dataset_reference_attribute_id",
        "dataset_reference_source_column",
        "dataset_reference_source_column_data_type",
    )

    # Reading a field declared with `deprecated=` emits a DeprecationWarning.
    # Silence it for this internal check so that callers who only use
    # `parent_dataset_references` are not warned about fields they never set.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", DeprecationWarning)
        missing_legacy = [
            name for name in legacy_field_names if getattr(self, name) is None
        ]

    has_new = bool(self.parent_dataset_references)
    # The legacy form counts as "used" as soon as any of its fields is set.
    has_legacy = len(missing_legacy) < len(legacy_field_names)

    if has_new and has_legacy:
        raise ValueError(
            "Set either `parent_dataset_references` or the legacy single-column "
            "fields (`parent_dataset_reference_attribute_id`, "
            "`dataset_reference_source_column`, "
            "`dataset_reference_source_column_data_type`), not both."
        )
    if not has_new and not has_legacy:
        raise ValueError(
            "Provide either `parent_dataset_references` or the legacy single-column "
            "fields (`parent_dataset_reference_attribute_id`, "
            "`dataset_reference_source_column`, "
            "`dataset_reference_source_column_data_type`)."
        )
    if has_legacy and missing_legacy:
        # A partial triple cannot be turned into a reference source.
        missing_list = ", ".join(f"`{name}`" for name in missing_legacy)
        raise ValueError(
            "The legacy single-column fields must all be set together; "
            f"missing: {missing_list}."
        )
    return self


class CustomDataset(BaseModel):
"""Custom dataset with its definition and custom fields."""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -129,3 +129,52 @@ def test_datasets_to_ldm(mock_custom_dataset):
assert ds.workspace_data_filter_references[0].filter_id.id == "wdf1"
assert len(ldm.date_instances) == 1
assert ldm.date_instances[0].id == "date1"


def test_datasets_to_ldm_parent_dataset_references_composite():
    """A two-entry `parent_dataset_references` list yields two sources, in order."""
    from gooddata_pipelines.ldm_extension.models.custom_data_object import (
        CustomDatasetDefinition,
        ParentDatasetReference,
    )

    first_join_column = ParentDatasetReference(
        attribute_id="parent_pk1",
        source_column="src_col1",
        data_type=ColumnDataType.STRING,
    )
    second_join_column = ParentDatasetReference(
        attribute_id="parent_pk2",
        source_column="src_col2",
        data_type=ColumnDataType.INT,
    )
    composite_definition = CustomDatasetDefinition(
        workspace_id="workspace1",
        dataset_id="ds_composite",
        dataset_name="Composite Dataset",
        dataset_source_table="table1",
        dataset_datasource_id="ds_source",
        dataset_source_sql=None,
        parent_dataset_reference="parent_ds",
        parent_dataset_references=[first_join_column, second_join_column],
        workspace_data_filter_id="wdf1",
        workspace_data_filter_column_name="col1",
    )
    custom_dataset = CustomDataset(
        definition=composite_definition, custom_fields=[]
    )

    ldm_model = LdmExtensionDataProcessor().datasets_to_ldm(
        {"ds_composite": custom_dataset}
    )

    # The first reference of the generated dataset is the parent reference.
    sources = ldm_model.ldm.datasets[0].references[0].sources
    assert len(sources) == 2
    assert [source.column for source in sources] == ["src_col1", "src_col2"]


def test_datasets_to_ldm_legacy_reference_fallback(mock_dataset_definition):
    """Without `parent_dataset_references`, the legacy fields produce one source."""
    mock_dataset_definition.parent_dataset_references = None
    legacy_dataset = CustomDataset(
        definition=mock_dataset_definition, custom_fields=[]
    )

    ldm_model = LdmExtensionDataProcessor().datasets_to_ldm({"ds1": legacy_dataset})

    sources = ldm_model.ldm.datasets[0].references[0].sources
    assert len(sources) == 1
    assert sources[0].column == "ref_col"
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
CustomDatasetDefinition,
CustomFieldDefinition,
CustomFieldType,
ParentDatasetReference,
)


Expand Down Expand Up @@ -100,3 +101,81 @@ def test_custom_dataset_model():
assert dataset.definition.dataset_id == "ds1"
assert len(dataset.custom_fields) == 1
assert dataset.custom_fields[0].custom_field_id == "cf1"


def test_custom_dataset_definition_parent_dataset_references_optional():
    """`parent_dataset_references` may be omitted and then defaults to None."""
    definition = CustomDatasetDefinition(**make_valid_dataset_def())
    assert definition.parent_dataset_references is None


def test_custom_dataset_definition_parent_dataset_references_accepted():
    """A list of composite references is accepted through the new field."""
    string_reference = ParentDatasetReference(
        attribute_id="parent_pk1",
        source_column="src_col1",
        data_type=ColumnDataType.STRING,
    )
    int_reference = ParentDatasetReference(
        attribute_id="parent_pk2",
        source_column="src_col2",
        data_type=ColumnDataType.INT,
    )
    # Clear the legacy triple so that only the new form is present.
    payload = make_valid_dataset_def(
        parent_dataset_reference_attribute_id=None,
        dataset_reference_source_column=None,
        dataset_reference_source_column_data_type=None,
        parent_dataset_references=[string_reference, int_reference],
    )

    definition = CustomDatasetDefinition(**payload)

    assert definition.parent_dataset_references is not None
    assert len(definition.parent_dataset_references) == 2
    assert definition.parent_dataset_references[1].data_type == ColumnDataType.INT


def test_custom_dataset_definition_no_reference_form_raises():
    """A definition with neither the legacy triple nor the new list is rejected."""
    payload = make_valid_dataset_def(
        parent_dataset_reference_attribute_id=None,
        dataset_reference_source_column=None,
        dataset_reference_source_column_data_type=None,
    )

    with pytest.raises(ValidationError) as excinfo:
        CustomDatasetDefinition(**payload)

    error_text = str(excinfo.value)
    assert "Provide either" in error_text


def test_custom_dataset_definition_mixed_reference_forms_raises():
    """Combining the legacy fields with `parent_dataset_references` is rejected."""
    new_style_reference = ParentDatasetReference(
        attribute_id="parent_pk",
        source_column="src_col",
        data_type=ColumnDataType.STRING,
    )
    # The base payload is left untouched apart from adding the new list.
    payload = make_valid_dataset_def(
        parent_dataset_references=[new_style_reference],
    )

    with pytest.raises(ValidationError) as excinfo:
        CustomDatasetDefinition(**payload)

    error_text = str(excinfo.value)
    assert "not both" in error_text


def test_custom_dataset_definition_legacy_reference_fields_optional():
    """The legacy fields may all be None when `parent_dataset_references` is given."""
    only_reference = ParentDatasetReference(
        attribute_id="parent_pk",
        source_column="src_col",
        data_type=ColumnDataType.STRING,
    )
    payload = make_valid_dataset_def(
        parent_dataset_reference_attribute_id=None,
        dataset_reference_source_column=None,
        dataset_reference_source_column_data_type=None,
        parent_dataset_references=[only_reference],
    )

    definition = CustomDatasetDefinition(**payload)

    assert definition.dataset_reference_source_column is None
    assert definition.parent_dataset_references is not None
Loading