Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 9 additions & 24 deletions import-automation/workflow/ingestion-helper/aggregation_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,12 +90,8 @@ def run_linked_contained_in_place(self, import_names: List[str] = None) -> None:
return

dest = self.executor.get_spanner_destination_uri()
# Escape single quotes to prevent SQL injection
safe_names = [name.replace("'", "''") for name in import_names]
prefix = "dc/base/" if self.is_base_dc else ""
provenances = [f"'{prefix}{name}'" for name in safe_names]
provenances = [f"'dc/base/{name}'" for name in import_names]
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

security-critical critical

Reverting this change introduces a SQL injection vulnerability because single quotes in import_names (which are sourced from user-controlled HTTP request payloads) are no longer escaped. Additionally, hardcoding the 'dc/base/' prefix breaks functionality when self.is_base_dc is False.

        safe_names = [name.replace("'", "''") for name in import_names]
        prefix = "dc/base/" if self.is_base_dc else ""
        provenances = [f"'{prefix}{name}'" for name in safe_names]
        gen_graphs_prov = 'dc/base/GeneratedGraphs' if self.is_base_dc else 'GeneratedGraphs'

provenance_filter = f" AND provenance IN ({', '.join(provenances)})"
gen_graphs_prov = 'dc/base/GeneratedGraphs' if self.is_base_dc else 'GeneratedGraphs'

query = f"""
-- Pull base edges needed for containedInPlace aggregation
Expand Down Expand Up @@ -142,7 +138,7 @@ def run_linked_contained_in_place(self, import_names: List[str] = None) -> None:
subject_id,
'linkedContainedInPlace' as predicate,
ancestor_place as object_id,
'{gen_graphs_prov}' as provenance
'dc/base/GeneratedGraphs' as provenance
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Hardcoding 'dc/base/GeneratedGraphs' breaks support for non-base Data Commons instances where self.is_base_dc is False. Use the dynamic gen_graphs_prov variable instead.

Suggested change
'dc/base/GeneratedGraphs' as provenance
'{gen_graphs_prov}' as provenance

FROM
Ancestors
),
Expand Down Expand Up @@ -179,12 +175,8 @@ def run_linked_member_of(self, import_names: List[str] = None) -> None:
return

dest = self.executor.get_spanner_destination_uri()
# Escape single quotes to prevent SQL injection
safe_names = [name.replace("'", "''") for name in import_names]
prefix = "dc/base/" if self.is_base_dc else ""
provenances = [f"'{prefix}{name}'" for name in safe_names]
provenances = [f"'dc/base/{name}'" for name in import_names]
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

security-critical critical

Reverting this change introduces a SQL injection vulnerability because single quotes in import_names are no longer escaped. Additionally, hardcoding the 'dc/base/' prefix breaks functionality when self.is_base_dc is False.

        safe_names = [name.replace("'", "''") for name in import_names]
        prefix = "dc/base/" if self.is_base_dc else ""
        provenances = [f"'{prefix}{name}'" for name in safe_names]
        gen_graphs_prov = 'dc/base/GeneratedGraphs' if self.is_base_dc else 'GeneratedGraphs'

provenance_filter = f" AND provenance IN ({', '.join(provenances)})"
gen_graphs_prov = 'dc/base/GeneratedGraphs' if self.is_base_dc else 'GeneratedGraphs'

query = f"""
-- Pull base edges needed for memberOf aggregation
Expand Down Expand Up @@ -234,7 +226,7 @@ def run_linked_member_of(self, import_names: List[str] = None) -> None:
subject_id,
'linkedMemberOf' as predicate,
ancestor as object_id,
'{gen_graphs_prov}' as provenance
'dc/base/GeneratedGraphs' as provenance
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Hardcoding 'dc/base/GeneratedGraphs' breaks support for non-base Data Commons instances where self.is_base_dc is False. Use the dynamic gen_graphs_prov variable instead.

Suggested change
'dc/base/GeneratedGraphs' as provenance
'{gen_graphs_prov}' as provenance

FROM
Ancestors
),
Expand Down Expand Up @@ -271,12 +263,8 @@ def run_linked_member(self, import_names: List[str] = None) -> None:
return

dest = self.executor.get_spanner_destination_uri()
# Escape single quotes to prevent SQL injection
safe_names = [name.replace("'", "''") for name in import_names]
prefix = "dc/base/" if self.is_base_dc else ""
provenances = [f"'{prefix}{name}'" for name in safe_names]
provenances = [f"'dc/base/{name}'" for name in import_names]
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

security-critical critical

Reverting this change introduces a SQL injection vulnerability because single quotes in import_names are no longer escaped. Additionally, hardcoding the 'dc/base/' prefix breaks functionality when self.is_base_dc is False.

        safe_names = [name.replace("'", "''") for name in import_names]
        prefix = "dc/base/" if self.is_base_dc else ""
        provenances = [f"'{prefix}{name}'" for name in safe_names]
        gen_graphs_prov = 'dc/base/GeneratedGraphs' if self.is_base_dc else 'GeneratedGraphs'

provenance_filter = f" AND provenance IN ({', '.join(provenances)})"
gen_graphs_prov = 'dc/base/GeneratedGraphs' if self.is_base_dc else 'GeneratedGraphs'

query = f"""
-- Pull base edges needed for member aggregation
Expand Down Expand Up @@ -324,7 +312,7 @@ def run_linked_member(self, import_names: List[str] = None) -> None:
descendant as subject_id,
'linkedMember' as predicate,
subject_id as object_id,
'{gen_graphs_prov}' as provenance
'dc/base/GeneratedGraphs' as provenance
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Hardcoding 'dc/base/GeneratedGraphs' breaks support for non-base Data Commons instances where self.is_base_dc is False. Use the dynamic gen_graphs_prov variable instead.

Suggested change
'dc/base/GeneratedGraphs' as provenance
'{gen_graphs_prov}' as provenance

FROM
Descendants
WHERE subject_id LIKE 'dc/topic%'
Expand Down Expand Up @@ -382,11 +370,8 @@ def run_provenance_summary_aggregation(self, import_names: List[str]) -> None:
dest = self.executor.get_spanner_destination_uri()
connection_id = self.executor.connection_id

# Escape single quotes to prevent SQL injection
safe_names = [name.replace("'", "''") for name in import_names]
# Format import names for the SQL IN clause
imports_str = ", ".join([f"'{name}'" for name in safe_names])
provenance_dcid_expr = "CONCAT('dc/base/', raw.import_name)" if self.is_base_dc else "raw.import_name"
imports_str = ", ".join([f"'{name}'" for name in import_names])
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

security-critical critical

Reverting this change introduces a SQL injection vulnerability because single quotes in import_names are no longer escaped. Additionally, hardcoding the 'dc/base/' prefix breaks functionality when self.is_base_dc is False.

        safe_names = [name.replace("'", "''") for name in import_names]
        imports_str = ", ".join([f"'{name}'" for name in safe_names])
        provenance_dcid_expr = "CONCAT('dc/base/', raw.import_name)" if self.is_base_dc else "raw.import_name"


query = f"""
-- Step 1: Fetch Observation rows for the specific import
Expand Down Expand Up @@ -450,7 +435,7 @@ def run_provenance_summary_aggregation(self, import_names: List[str]) -> None:
raw.is_dc_aggregate,
JSON_VALUE(v, '$.key') as date_val,
SAFE_CAST(JSON_VALUE(v, '$.value') AS FLOAT64) as value_num,
{provenance_dcid_expr} as provenance_dcid,
CONCAT('dc/base/', raw.import_name) as provenance_dcid,
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Hardcoding 'dc/base/' prefix breaks support for non-base Data Commons instances where self.is_base_dc is False. Use the dynamic provenance_dcid_expr variable instead.

Suggested change
CONCAT('dc/base/', raw.import_name) as provenance_dcid,
{provenance_dcid_expr} as provenance_dcid,

nodes.name as place_name,
edges.place_type
FROM `temp_obs_raw` raw
Expand Down Expand Up @@ -588,7 +573,7 @@ def __init__(self,
project_id=project_id,
instance_id=instance_id,
database_id=database_id,
location=location
location=location,
)
self.linked_edge_generator = LinkedEdgeGenerator(self.executor, is_base_dc)
self.provenance_summary_generator = ProvenanceSummaryGenerator(self.executor, is_base_dc)
Expand Down
Loading