From 27a74e9d8b4a3f97a78744455a8498a1c861e637 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 3 Jul 2026 09:39:29 +0200 Subject: [PATCH 1/2] fix: Keep named request queues across runs --- src/crawlee/crawlers/_basic/_basic_crawler.py | 18 +++++++-- .../crawlers/_basic/test_basic_crawler.py | 39 +++++++++++++++++++ 2 files changed, 54 insertions(+), 3 deletions(-) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index be3da6dd27..d838bc439b 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -692,8 +692,9 @@ async def run( Args: requests: The requests to be enqueued before the crawler starts. - purge_request_queue: If this is `True` and the crawler is not being run for the first time, the default - request queue will be purged. + purge_request_queue: If this is `True` and the crawler is not being run for the first time, the request + queue will be purged. Named request queues are considered persistent and are never purged + implicitly. """ if self._running: raise RuntimeError( @@ -717,7 +718,18 @@ async def run( if purge_request_queue: request_manager = await self.get_request_manager() - await request_manager.purge() + # A `ThrottlingRequestManager` delegates `purge` to the manager it wraps, so inspect the wrapped + # manager when deciding whether the purge would hit a named queue. + inner_manager = ( + request_manager._inner # noqa: SLF001 + if isinstance(request_manager, ThrottlingRequestManager) + else request_manager + ) + # Named storages are persistent and shared across runs, so they are never purged implicitly + # (the same named-storage exemption as in `StorageClient._purge_if_needed`). + is_named_queue = isinstance(inner_manager, RequestQueue) and inner_manager.name is not None + if not is_named_queue: + await request_manager.purge() if requests is not None: await self.add_requests(requests) diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index 0391d65843..68f75f416d 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -1073,6 +1073,45 @@ async def handler(context: BasicCrawlingContext) -> None: } +async def test_consecutive_runs_do_not_purge_named_request_queue() -> None: + """A second `run()` must not purge a user-supplied named request queue, as named storages persist across runs.""" + queue = await RequestQueue.open(name='persistent-queue') + crawler = BasicCrawler(request_manager=queue) + visited = list[str]() + + @crawler.router.default_handler + async def handler(context: BasicCrawlingContext) -> None: + visited.append(context.request.url) + + await crawler.run(['https://a.placeholder.com']) + + # Simulate new work added to the persistent queue between runs. + await queue.add_request('https://b.placeholder.com') + await crawler.run() + + assert visited == ['https://a.placeholder.com', 'https://b.placeholder.com'] + + +async def test_consecutive_runs_do_not_purge_named_queue_wrapped_in_throttling_manager() -> None: + """A second `run()` must not purge a named request queue wrapped in a `ThrottlingRequestManager`.""" + queue = await RequestQueue.open(name='persistent-queue') + throttler = ThrottlingRequestManager(inner=queue, domains=[], request_manager_opener=RequestQueue.open) + crawler = BasicCrawler(request_manager=throttler) + visited = list[str]() + + @crawler.router.default_handler + async def handler(context: BasicCrawlingContext) -> None: + visited.append(context.request.url) + + await crawler.run(['https://a.placeholder.com']) + + # Simulate new work added to the persistent queue between runs. + await queue.add_request('https://b.placeholder.com') + await crawler.run() + + assert visited == ['https://a.placeholder.com', 'https://b.placeholder.com'] + + @pytest.mark.skipif(os.name == 'nt' and 'CI' in os.environ, reason='Skipped in Windows CI') @pytest.mark.parametrize( ('statistics_log_format'), From 0cc592953af4052cadbd8540e96a405a9447c679 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 3 Jul 2026 10:41:43 +0200 Subject: [PATCH 2/2] refactor: Expose ThrottlingRequestManager.inner instead of reaching into _inner --- src/crawlee/crawlers/_basic/_basic_crawler.py | 4 +--- src/crawlee/request_loaders/_throttling_request_manager.py | 5 +++++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index d838bc439b..5d95a1b7a2 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -721,9 +721,7 @@ async def run( # A `ThrottlingRequestManager` delegates `purge` to the manager it wraps, so inspect the wrapped # manager when deciding whether the purge would hit a named queue. inner_manager = ( - request_manager._inner # noqa: SLF001 - if isinstance(request_manager, ThrottlingRequestManager) - else request_manager + request_manager.inner if isinstance(request_manager, ThrottlingRequestManager) else request_manager ) # Named storages are persistent and shared across runs, so they are never purged implicitly # (the same named-storage exemption as in `StorageClient._purge_if_needed`). diff --git a/src/crawlee/request_loaders/_throttling_request_manager.py b/src/crawlee/request_loaders/_throttling_request_manager.py index 1f6909dedf..d48c49ec92 100644 --- a/src/crawlee/request_loaders/_throttling_request_manager.py +++ b/src/crawlee/request_loaders/_throttling_request_manager.py @@ -105,6 +105,11 @@ def __init__( """Set whenever a request is added or reclaimed. Lets `fetch_next_request` wake from a throttle wait early when fresh work appears, instead of sleeping for the full computed cooldown.""" + @property + def inner(self) -> TRequestManager: + """The wrapped request manager that stores requests for non-throttled domains.""" + return self._inner + @override async def drop(self) -> None: await asyncio.gather(self._inner.drop(), *(sm.drop() for sm in self._sub_managers.values()))