diff --git a/src/crawlee/_utils/requests.py b/src/crawlee/_utils/requests.py index fa31d4621d..9745cdb73d 100644 --- a/src/crawlee/_utils/requests.py +++ b/src/crawlee/_utils/requests.py @@ -20,7 +20,7 @@ def normalize_url(url: str, *, keep_url_fragment: bool = False) -> str: converting the scheme and netloc to lower case, stripping unwanted tracking parameters (specifically those beginning with 'utm_'), sorting the remaining query parameters alphabetically, and optionally retaining the URL fragment. The goal is to ensure that URLs that are functionally - identical but differ in trivial ways (such as parameter order or casing) are treated as the same. + identical but differ in trivial ways (such as parameter order or scheme/host casing) are treated as the same. Args: url: The URL to be normalized. @@ -44,7 +44,7 @@ def normalize_url(url: str, *, keep_url_fragment: bool = False) -> str: yarl_new_url.path.removesuffix('/'), keep_query=True, keep_fragment=keep_url_fragment ) - return str(yarl_new_url).lower() + return str(yarl_new_url) def compute_unique_key( diff --git a/tests/unit/_utils/test_requests.py b/tests/unit/_utils/test_requests.py index 8198909592..2db4ee9dde 100644 --- a/tests/unit/_utils/test_requests.py +++ b/tests/unit/_utils/test_requests.py @@ -15,7 +15,9 @@ 'http://example.com/?another_key=another_value&key=value', False, ), - ('HTTPS://EXAMPLE.COM/?KEY=VALUE', 'https://example.com/?key=value', False), + ('HTTPS://EXAMPLE.COM/?KEY=VALUE', 'https://example.com/?KEY=VALUE', False), + ('HTTPS://EXAMPLE.COM/Product/ABC?token=SeCrEt', 'https://example.com/Product/ABC?token=SeCrEt', False), + ('HTTP://EXAMPLE.COM/Path#Frag', 'http://example.com/Path#Frag', True), ('', '', False), ('http://example.com/#fragment', 'http://example.com/#fragment', True), ('http://example.com/#fragment', 'http://example.com', False), @@ -26,6 +28,8 @@ 'remove_utm_params', 'retain_sort_non_utm_params', 'convert_scheme_netloc_to_lowercase', + 'preserve_path_query_case', + 'preserve_fragment_case', 'handle_empty_url', 'retain_fragment', 'remove_fragment', @@ -38,6 +42,19 @@ def test_normalize_url(url: str, expected_output: str, *, keep_url_fragment: boo assert output == expected_output +@pytest.mark.parametrize( + ('first_url', 'second_url'), + [ + ('https://example.com/Product/ABC', 'https://example.com/product/abc'), + ('https://example.com/?token=SeCrEt', 'https://example.com/?token=secret'), + ('https://example.com/?Token=secret', 'https://example.com/?token=secret'), + ], + ids=['path_key', 'query_value_key', 'query_name_key'], +) +def test_compute_unique_key_preserves_case_sensitive_path_and_query(first_url: str, second_url: str) -> None: + assert compute_unique_key(first_url) != compute_unique_key(second_url) + + def test_compute_unique_key_basic() -> None: url = 'https://crawlee.dev' uk_get = compute_unique_key(url, method='GET')