Skip to content

Commit 48271b7

Browse files
sagebreeSamson Gebre
andauthored
Sql guardrails, Improve error handling, fix precision defaults for float attribute (#185)
- SQL guardrails — block statement stacking and writes hidden by comments, string literals, or zero-width prefixes - DataFrame MultiIndex columns rejected up-front with a flatten hint instead of producing tuple keys that fail deep in the JSON encoder - records.get() deprecation warning now names both migration paths (retrieve() for single-by-ID and list(filter=...) for filtered queries) - Server error.innererror.message appended to HttpError.message on both single-request and batch paths, so the offending field/dtype reaches users without inspecting the wire payload - "float"/"double" shorthand precision default raised from 2 to 5 (max for DoubleAttributeMetadata.Precision) so values like 2.718 round-trip without silent truncation; decimal/money left at 2 --------- Co-authored-by: Samson Gebre <sagebree@microsoft.com>
1 parent b940024 commit 48271b7

9 files changed

Lines changed: 208 additions & 6 deletions

File tree

src/PowerPlatform/Dataverse/data/_batch_base.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -399,6 +399,14 @@ def _raise_top_level_batch_error(response: Any) -> None:
399399
error = payload.get("error", {})
400400
service_error_code = error.get("code") or None
401401
message: str = error.get("message") or response.text or "Unexpected non-multipart response from $batch"
402+
# Dataverse nests a more specific message (e.g. offending field/dtype)
403+
# under error.innererror.message. Append it so users see actionable
404+
# detail without inspecting the raw payload.
405+
innererror = error.get("innererror") if isinstance(error, dict) else None
406+
if isinstance(innererror, dict):
407+
inner_msg = innererror.get("message")
408+
if isinstance(inner_msg, str) and inner_msg.strip() and inner_msg.strip() != message:
409+
message = f"{message}: {inner_msg.strip()}"
402410
except Exception:
403411
message = (getattr(response, "text", None) or "") or "Unexpected non-multipart response from $batch"
404412
raise HttpError(

src/PowerPlatform/Dataverse/data/_odata.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,14 @@ def _request(self, method: str, url: str, *, expected: tuple[int, ...] = _DEFAUL
144144
imsg = inner.get("message")
145145
if isinstance(imsg, str) and imsg.strip():
146146
msg = imsg.strip()
147+
# Dataverse nests a more specific message (e.g. offending
148+
# field/dtype) under error.innererror.message. Append it so
149+
# users don't have to chase the wire payload to debug.
150+
innererror = inner.get("innererror")
151+
if isinstance(innererror, dict):
152+
inner_msg = innererror.get("message")
153+
if isinstance(inner_msg, str) and inner_msg.strip() and inner_msg.strip() != msg:
154+
msg = f"{msg}: {inner_msg.strip()}"
147155
else:
148156
imsg2 = data.get("message")
149157
if isinstance(imsg2, str) and imsg2.strip():

src/PowerPlatform/Dataverse/data/_odata_base.py

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -515,7 +515,10 @@ def _attribute_payload(
515515
"RequiredLevel": {"Value": "None"},
516516
"MinValue": -100000000000.0,
517517
"MaxValue": 100000000000.0,
518-
"Precision": 2,
518+
# Default to the max DoubleAttributeMetadata.Precision (5) so
519+
# round-trips of typical scientific/ML floats don't silently
520+
# truncate; "float" implies precision, unlike money/decimal.
521+
"Precision": 5,
519522
}
520523
if dtype_l in ("datetime", "date"):
521524
return {
@@ -772,10 +775,13 @@ def _build_get_relationship(self, schema_name: str) -> _RawRequest:
772775

773776
# ----------------------- SQL guardrail patterns --------------------
774777
_SQL_WRITE_RE = re.compile(
775-
r"^\s*(?:INSERT|UPDATE|DELETE|DROP|TRUNCATE|ALTER|CREATE|EXEC|GRANT|REVOKE|BULK)\b",
778+
r"\b(?:INSERT|UPDATE|DELETE|DROP|TRUNCATE|ALTER|CREATE|EXEC|GRANT|REVOKE|BULK)\b",
776779
re.IGNORECASE,
777780
)
778781
_SQL_COMMENT_RE = re.compile(r"/\*[^*]*\*+(?:[^/*][^*]*\*+)*/|--[^\n]*", re.DOTALL)
782+
# Mask single-quoted string literals (with '' escape) before keyword scans
783+
# so values like 'DELETE ME' or ';' inside a WHERE clause don't false-positive.
784+
_SQL_STRING_LITERAL_RE = re.compile(r"'(?:[^']|'')*'")
779785
_SQL_LEADING_WILDCARD_RE = re.compile(r"\bLIKE\s+'%[^']", re.IGNORECASE)
780786
_SQL_IMPLICIT_CROSS_JOIN_RE = re.compile(
781787
r"\bFROM\s+[A-Za-z0-9_]+(?:\s+[A-Za-z0-9_]+)?\s*,\s*[A-Za-z0-9_]+",
@@ -835,9 +841,24 @@ def _sql_guardrails(self, sql: str) -> str:
835841
"""
836842
# --- BLOCKED (save server round-trip) ---
837843

838-
# 1. Block writes (strip SQL comments first to catch comment-prefixed writes)
844+
# Strip comments and mask string literals before keyword/semicolon scans
845+
# so write keywords and ';' tucked inside string values don't trip the
846+
# checks, while ones hidden by comments or statement stacking do.
839847
sql_no_comments = self._SQL_COMMENT_RE.sub(" ", sql).strip()
840-
if self._SQL_WRITE_RE.search(sql_no_comments):
848+
sql_scan = self._SQL_STRING_LITERAL_RE.sub("''", sql_no_comments)
849+
850+
# 1a. Block statement stacking (semicolons separating statements).
851+
if ";" in sql_scan:
852+
raise ValidationError(
853+
"Multiple SQL statements are not supported. The Dataverse SQL "
854+
"endpoint accepts exactly one SELECT statement per call. "
855+
"Remove the ';' and submit each query separately.",
856+
subcode=VALIDATION_SQL_UNSUPPORTED_SYNTAX,
857+
)
858+
859+
# 1b. Block writes anywhere in the query (not just at the start),
860+
# to catch writes hidden after comments or earlier SELECTs.
861+
if self._SQL_WRITE_RE.search(sql_scan):
841862
raise ValidationError(
842863
"SQL endpoint is read-only. Use client.records or "
843864
"client.dataframe for write operations "

src/PowerPlatform/Dataverse/operations/records.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -431,8 +431,11 @@ def get(
431431
"""
432432
if record_id is not None:
433433
warnings.warn(
434-
"'records.get()' with a record_id is deprecated; "
435-
"use 'client.records.retrieve(table, record_id)' instead.",
434+
"'records.get(table, record_id)' is deprecated. "
435+
"For a single record by ID, use "
436+
"'client.records.retrieve(table, record_id)'. "
437+
"For a filtered list, use "
438+
"'client.records.list(table, filter=...)'.",
436439
DeprecationWarning,
437440
stacklevel=2,
438441
)

src/PowerPlatform/Dataverse/utils/_pandas.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,15 @@ def dataframe_to_records(df: pd.DataFrame, na_as_null: bool = False) -> List[Dic
3939
:param df: Input DataFrame.
4040
:param na_as_null: When False (default), missing values are omitted from each dict.
4141
When True, missing values are included as None (sends null to Dataverse, clearing the field).
42+
:raises ValueError: If ``df`` has ``MultiIndex`` columns. Tuple column keys
43+
do not round-trip through Dataverse's JSON encoder and produce a
44+
confusing error far from the call site. Flatten to string column namesfirst.
4245
"""
46+
if isinstance(df.columns, pd.MultiIndex):
47+
raise ValueError(
48+
"MultiIndex columns are not supported. Flatten to string column names first, "
49+
"for example: df.columns = ['_'.join(map(str, col)) for col in df.columns.to_flat_index()]."
50+
)
4351
records = []
4452
for row in df.to_dict(orient="records"):
4553
clean = {}

tests/unit/data/test_batch_edge_cases.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -417,6 +417,28 @@ def test_error_code_preserved_as_service_error_code(self):
417417
_raise_top_level_batch_error(mock_resp)
418418
self.assertEqual(ctx.exception.details.get("service_error_code"), "0x80040220")
419419

420+
def test_innererror_message_appended_to_top_level(self):
421+
"""error.innererror.message surfaces alongside the vague top-level
422+
payload error, so the offending field reaches the user."""
423+
mock_resp = MagicMock()
424+
mock_resp.status_code = 400
425+
mock_resp.json.return_value = {
426+
"error": {
427+
"code": "0x80060891",
428+
"message": "Error identified in Payload provided by the user for Entity :contacts",
429+
"innererror": {
430+
"message": "Field 'birthdate' is a Date type and cannot accept a DateTime value with offset.",
431+
"type": "System.Exception",
432+
},
433+
}
434+
}
435+
mock_resp.text = json.dumps(mock_resp.json.return_value)
436+
437+
with self.assertRaises(HttpError) as ctx:
438+
_raise_top_level_batch_error(mock_resp)
439+
self.assertIn("Error identified in Payload", str(ctx.exception))
440+
self.assertIn("birthdate", str(ctx.exception))
441+
420442

421443
# ---------------------------------------------------------------------------
422444
# 8. Batch response without continue-on-error (first failure stops)

tests/unit/data/test_odata_internal.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -636,6 +636,41 @@ def test_retry_after_int_stored_in_details(self):
636636
self.client._request("get", "http://example.com/test")
637637
self.assertEqual(ctx.exception.details.get("retry_after"), 30)
638638

639+
def test_innererror_message_appended_to_top_level(self):
640+
"""error.innererror.message is appended to the top-level message so the
641+
offending field/dtype reaches the user instead of being hidden in the
642+
wire payload."""
643+
response = self._make_raw_response(
644+
400,
645+
json_data={
646+
"error": {
647+
"code": "0x80060891",
648+
"message": "Error identified in Payload provided by the user for Entity :contacts",
649+
"innererror": {
650+
"message": "Field 'birthdate' is a Date type and cannot accept a DateTime value with offset.",
651+
"type": "System.Exception",
652+
},
653+
}
654+
},
655+
)
656+
self.client._raw_request = MagicMock(return_value=response)
657+
with self.assertRaises(HttpError) as ctx:
658+
self.client._request("post", "http://example.com/contacts")
659+
self.assertIn("Error identified in Payload", str(ctx.exception))
660+
self.assertIn("birthdate", str(ctx.exception))
661+
self.assertIn("DateTime value with offset", str(ctx.exception))
662+
663+
def test_no_innererror_message_unchanged(self):
664+
"""Absence of innererror leaves the top-level message untouched."""
665+
response = self._make_raw_response(
666+
400,
667+
json_data={"error": {"code": "0x0", "message": "Bad request"}},
668+
)
669+
self.client._raw_request = MagicMock(return_value=response)
670+
with self.assertRaises(HttpError) as ctx:
671+
self.client._request("get", "http://example.com/test")
672+
self.assertEqual(ctx.exception.message, "Bad request")
673+
639674

640675
class TestCreateMultiple(unittest.TestCase):
641676
"""Unit tests for _ODataClient._create_multiple."""
@@ -1558,6 +1593,20 @@ def test_double_dtype_alias(self):
15581593
result = self.od._attribute_payload("new_Score", "double")
15591594
self.assertIn("Double", result["@odata.type"])
15601595

1596+
def test_float_dtype_precision_default_is_5(self):
1597+
"""'float' must default to the max Double precision (5) so values
1598+
like 2.718 round-trip without silent truncation. Regression guard
1599+
for the previous Precision=2 default."""
1600+
result = self.od._attribute_payload("new_Score", "float")
1601+
self.assertEqual(result["Precision"], 5)
1602+
1603+
def test_decimal_dtype_precision_unchanged_at_2(self):
1604+
"""'decimal'/'money' should keep Precision=2 because that matches
1605+
currency semantics. Regression guard so a future cleanup doesn't
1606+
quietly broaden this along with float."""
1607+
result = self.od._attribute_payload("new_Revenue", "decimal")
1608+
self.assertEqual(result["Precision"], 2)
1609+
15611610
def test_datetime_dtype(self):
15621611
"""'datetime' produces DateTimeAttributeMetadata."""
15631612
result = self.od._attribute_payload("new_CreatedDate", "datetime")

tests/unit/data/test_sql_guardrails.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,66 @@ def test_comment_prefixed_writes_blocked(self, sql):
9292
with pytest.raises(ValidationError, match="read-only"):
9393
c._sql_guardrails(sql)
9494

95+
@pytest.mark.parametrize(
96+
"sql",
97+
[
98+
# Statement-stacked writes after a SELECT and a semicolon
99+
"SELECT name FROM account; DROP TABLE account",
100+
# Write hidden after a line comment that strips a newline
101+
"SELECT name FROM account -- \nINSERT INTO account(name) VALUES('x')",
102+
# Write after a malformed/nested block-comment dance
103+
"/* /* nested */ */ DELETE FROM account",
104+
],
105+
)
106+
def test_writes_hidden_after_select_or_comments_blocked(self, sql):
107+
c = _client()
108+
with pytest.raises(ValidationError):
109+
c._sql_guardrails(sql)
110+
111+
@pytest.mark.parametrize(
112+
"sql",
113+
[
114+
"SELECT name FROM account; SELECT name FROM contact",
115+
"SELECT name FROM account;",
116+
],
117+
)
118+
def test_semicolon_stacking_blocked(self, sql):
119+
c = _client()
120+
with pytest.raises(ValidationError, match="Multiple SQL statements"):
121+
c._sql_guardrails(sql)
122+
123+
@pytest.mark.parametrize(
124+
"sql",
125+
[
126+
# Semicolon inside a string literal must not trip the check
127+
"SELECT TOP 10 name FROM account WHERE name = ';'",
128+
# Write keywords inside string literals must not trip the check
129+
"SELECT TOP 10 name FROM account WHERE name = 'DROP ME'",
130+
"SELECT TOP 10 name FROM account WHERE name = 'INSERT; DELETE'",
131+
],
132+
)
133+
def test_keywords_and_semicolons_inside_string_literals_allowed(self, sql):
134+
c = _client()
135+
result = c._sql_guardrails(sql)
136+
assert "SELECT" in result
137+
138+
@pytest.mark.parametrize(
139+
"sql",
140+
[
141+
# ZWSP, ZWNJ, ZWJ, BOM before a write keyword must still be blocked.
142+
# Python's \s does not match these, so an ^\s* anchor would miss
143+
# them; the \b-based scan must catch them at the word boundary.
144+
"​INSERT INTO account(name) VALUES('x')",
145+
"‌DELETE FROM account",
146+
"‍DROP TABLE account",
147+
"UPDATE account SET name = 'x'",
148+
],
149+
)
150+
def test_zero_width_prefixed_writes_blocked(self, sql):
151+
c = _client()
152+
with pytest.raises(ValidationError, match="read-only"):
153+
c._sql_guardrails(sql)
154+
95155

96156
# ===================================================================
97157
# 1b. Block server-rejected SQL patterns (save round-trip)

tests/unit/test_pandas_helpers.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,29 @@ def test_empty_dataframe(self):
189189
result = dataframe_to_records(df)
190190
self.assertEqual(result, [])
191191

192+
def test_multiindex_columns_raises(self):
193+
"""MultiIndex columns produce tuple keys that don't survive JSON
194+
serialization. Reject up-front with a clear message instead of
195+
letting a TypeError surface deep in the HTTP layer.
196+
"""
197+
cols = pd.MultiIndex.from_tuples([("a", "firstname"), ("a", "lastname")])
198+
df = pd.DataFrame([["x", "y"]], columns=cols)
199+
with self.assertRaises(ValueError) as ctx:
200+
dataframe_to_records(df)
201+
msg = str(ctx.exception)
202+
self.assertIn("MultiIndex", msg)
203+
# The remediation must produce string column names, not tuples.
204+
self.assertIn("'_'.join(map(str, col)) for col in df.columns.to_flat_index()", msg)
205+
206+
def test_multiindex_remediation_actually_works(self):
207+
"""The exact remediation in the error message must produce a
208+
DataFrame that dataframe_to_records can serialize."""
209+
cols = pd.MultiIndex.from_tuples([("a", "firstname"), ("a", "lastname")])
210+
df = pd.DataFrame([["x", "y"]], columns=cols)
211+
df.columns = ["_".join(map(str, col)) for col in df.columns.to_flat_index()]
212+
records = dataframe_to_records(df)
213+
self.assertEqual(records, [{"a_firstname": "x", "a_lastname": "y"}])
214+
192215
def test_mixed_types(self):
193216
"""DataFrame with mixed types (str, int, float, None, Timestamp) converts correctly."""
194217
ts = pd.Timestamp("2024-06-01")

0 commit comments

Comments
 (0)