From 1db8c20273ffbe622838426aa025b8966fbcd7e6 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 2 Jul 2026 16:30:55 +0200 Subject: [PATCH 1/2] fix: Preserve host logging, keep named request queues across runs, fix adaptive-crawler extra --- pyproject.toml | 1 + src/crawlee/crawlers/_basic/_basic_crawler.py | 33 +++++++-- .../crawlers/_basic/test_basic_crawler.py | 74 +++++++++++++++++++ uv.lock | 4 + 4 files changed, 104 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3a250c8101..e6546927bd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,6 +52,7 @@ dependencies = [ [project.optional-dependencies] all = ["crawlee[adaptive-crawler,pydantic-ai,beautifulsoup,cli,curl-impersonate,httpx,parsel,playwright,otel,sql_sqlite,sql_postgres,sql_mysql,stagehand,redis]"] adaptive-crawler = [ + "crawlee[beautifulsoup,parsel]", "jaro-winkler>=2.0.3", "playwright>=1.27.0", "scikit-learn>=1.6.0", diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index be3da6dd27..b0294ffc2f 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -179,7 +179,8 @@ class _BasicCrawlerOptions(TypedDict): """If True, the crawler stops immediately when any request handler error occurs.""" configure_logging: NotRequired[bool] - """If True, the crawler will set up logging infrastructure automatically.""" + """If True, the crawler will set up logging infrastructure automatically, unless the root logger already has + handlers configured by the host application.""" statistics_log_format: NotRequired[Literal['table', 'inline']] """If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain @@ -349,7 +350,8 @@ def __init__( abort_on_error: If True, the crawler stops immediately when any request handler error occurs. keep_alive: If True, it will keep crawler alive even if there are no requests in queue. Use `crawler.stop()` to exit the crawler. - configure_logging: If True, the crawler will set up logging infrastructure automatically. + configure_logging: If True, the crawler will set up logging infrastructure automatically, unless the root + logger already has handlers configured by the host application. statistics_log_format: If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain text log messages. respect_robots_txt_file: If set to `True`, the crawler will automatically try to fetch the robots.txt file @@ -451,9 +453,12 @@ def __init__( # Logging setup if configure_logging: root_logger = logging.getLogger() - configure_logger(root_logger, remove_old_handlers=True) - httpx_logger = logging.getLogger('httpx') # Silence HTTPX logger - httpx_logger.setLevel(logging.DEBUG if get_configured_log_level() <= logging.DEBUG else logging.WARNING) + # Leave the loggers untouched if the host application has already configured the root logger, + # mirroring `logging.basicConfig` semantics. + if not root_logger.handlers: + configure_logger(root_logger) + httpx_logger = logging.getLogger('httpx') # Silence HTTPX logger + httpx_logger.setLevel(logging.DEBUG if get_configured_log_level() <= logging.DEBUG else logging.WARNING) self._logger = _logger or logging.getLogger(__name__) if implicit_event_manager_with_explicit_config: self._logger.warning( @@ -692,8 +697,9 @@ async def run( Args: requests: The requests to be enqueued before the crawler starts. - purge_request_queue: If this is `True` and the crawler is not being run for the first time, the default - request queue will be purged. + purge_request_queue: If this is `True` and the crawler is not being run for the first time, the request + queue will be purged. Named request queues are considered persistent and are never purged + implicitly. """ if self._running: raise RuntimeError( @@ -717,7 +723,18 @@ async def run( if purge_request_queue: request_manager = await self.get_request_manager() - await request_manager.purge() + # A `ThrottlingRequestManager` delegates `purge` to the manager it wraps, so inspect the wrapped + # manager when deciding whether the purge would hit a named queue. + inner_manager = ( + request_manager._inner # noqa: SLF001 + if isinstance(request_manager, ThrottlingRequestManager) + else request_manager + ) + # Named storages are persistent and shared across runs, so they are never purged implicitly + # (the same named-storage exemption as in `StorageClient._purge_if_needed`). + is_named_queue = isinstance(inner_manager, RequestQueue) and inner_manager.name is not None + if not is_named_queue: + await request_manager.purge() if requests is not None: await self.add_requests(requests) diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index 0391d65843..7f88b4f794 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -20,6 +20,7 @@ import pytest from crawlee import ConcurrencySettings, Glob, service_locator +from crawlee._log_config import CrawleeLogFormatter from crawlee._request import Request, RequestState from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, HttpMethod from crawlee._utils.robots import RobotsTxtFile @@ -1053,6 +1054,40 @@ def test_crawler_log() -> None: crawler.log.info('Test log message') +def test_configure_logging_preserves_host_app_root_handlers() -> None: + """Constructing a crawler must not remove handlers installed on the root logger by the host application.""" + root_logger = logging.getLogger() + host_handler = logging.NullHandler() + root_logger.addHandler(host_handler) + + try: + BasicCrawler() + assert host_handler in root_logger.handlers + # The host-configured root logger must not gain Crawlee's own handler either. + assert not any(isinstance(handler.formatter, CrawleeLogFormatter) for handler in root_logger.handlers) + finally: + root_logger.removeHandler(host_handler) + + +def test_configure_logging_sets_up_unconfigured_root_logger() -> None: + """Constructing a crawler installs Crawlee's log handler when the root logger has no handlers yet.""" + root_logger = logging.getLogger() + saved_handlers = root_logger.handlers[:] + saved_level = root_logger.level + for handler in saved_handlers: + root_logger.removeHandler(handler) + + try: + BasicCrawler() + assert any(isinstance(handler.formatter, CrawleeLogFormatter) for handler in root_logger.handlers) + finally: + for handler in root_logger.handlers[:]: + root_logger.removeHandler(handler) + for handler in saved_handlers: + root_logger.addHandler(handler) + root_logger.setLevel(saved_level) + + async def test_consecutive_runs_purge_request_queue() -> None: crawler = BasicCrawler() visit = Mock() @@ -1073,6 +1108,45 @@ async def handler(context: BasicCrawlingContext) -> None: } +async def test_consecutive_runs_do_not_purge_named_request_queue() -> None: + """A second `run()` must not purge a user-supplied named request queue, as named storages persist across runs.""" + queue = await RequestQueue.open(name='persistent-queue') + crawler = BasicCrawler(request_manager=queue) + visited = list[str]() + + @crawler.router.default_handler + async def handler(context: BasicCrawlingContext) -> None: + visited.append(context.request.url) + + await crawler.run(['https://a.placeholder.com']) + + # Simulate new work added to the persistent queue between runs. + await queue.add_request('https://b.placeholder.com') + await crawler.run() + + assert visited == ['https://a.placeholder.com', 'https://b.placeholder.com'] + + +async def test_consecutive_runs_do_not_purge_named_queue_wrapped_in_throttling_manager() -> None: + """A second `run()` must not purge a named request queue wrapped in a `ThrottlingRequestManager`.""" + queue = await RequestQueue.open(name='persistent-queue') + throttler = ThrottlingRequestManager(inner=queue, domains=[], request_manager_opener=RequestQueue.open) + crawler = BasicCrawler(request_manager=throttler) + visited = list[str]() + + @crawler.router.default_handler + async def handler(context: BasicCrawlingContext) -> None: + visited.append(context.request.url) + + await crawler.run(['https://a.placeholder.com']) + + # Simulate new work added to the persistent queue between runs. + await queue.add_request('https://b.placeholder.com') + await crawler.run() + + assert visited == ['https://a.placeholder.com', 'https://b.placeholder.com'] + + @pytest.mark.skipif(os.name == 'nt' and 'CI' in os.environ, reason='Skipped in Windows CI') @pytest.mark.parametrize( ('statistics_log_format'), diff --git a/uv.lock b/uv.lock index 3bdfb65f4e..916c85b83c 100644 --- a/uv.lock +++ b/uv.lock @@ -794,8 +794,11 @@ dependencies = [ [package.optional-dependencies] adaptive-crawler = [ { name = "apify-fingerprint-datapoints" }, + { name = "beautifulsoup4", extra = ["lxml"] }, { name = "browserforge" }, + { name = "html5lib" }, { name = "jaro-winkler" }, + { name = "parsel" }, { name = "playwright" }, { name = "scikit-learn", version = "1.7.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "scikit-learn", version = "1.8.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, @@ -944,6 +947,7 @@ requires-dist = [ { name = "colorama", specifier = ">=0.4.0" }, { name = "cookiecutter", marker = "extra == 'cli'", specifier = ">=2.6.0" }, { name = "crawlee", extras = ["adaptive-crawler", "pydantic-ai", "beautifulsoup", "cli", "curl-impersonate", "httpx", "parsel", "playwright", "otel", "sql-sqlite", "sql-postgres", "sql-mysql", "stagehand", "redis"], marker = "extra == 'all'" }, + { name = "crawlee", extras = ["beautifulsoup", "parsel"], marker = "extra == 'adaptive-crawler'" }, { name = "cryptography", marker = "extra == 'sql-mysql'", specifier = ">=46.0.5" }, { name = "curl-cffi", marker = "extra == 'curl-impersonate'", specifier = ">=0.9.0" }, { name = "html5lib", marker = "extra == 'beautifulsoup'", specifier = ">=1.0" }, From 7781398f357897bca4efd49246246e235b187a9b Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 3 Jul 2026 09:44:34 +0200 Subject: [PATCH 2/2] chore: Split named-queue and adaptive-crawler fixes into separate PRs --- pyproject.toml | 1 - src/crawlee/crawlers/_basic/_basic_crawler.py | 18 ++------- .../crawlers/_basic/test_basic_crawler.py | 39 ------------------- uv.lock | 4 -- 4 files changed, 3 insertions(+), 59 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e6546927bd..3a250c8101 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,7 +52,6 @@ dependencies = [ [project.optional-dependencies] all = ["crawlee[adaptive-crawler,pydantic-ai,beautifulsoup,cli,curl-impersonate,httpx,parsel,playwright,otel,sql_sqlite,sql_postgres,sql_mysql,stagehand,redis]"] adaptive-crawler = [ - "crawlee[beautifulsoup,parsel]", "jaro-winkler>=2.0.3", "playwright>=1.27.0", "scikit-learn>=1.6.0", diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index b0294ffc2f..fd56a259a4 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -697,9 +697,8 @@ async def run( Args: requests: The requests to be enqueued before the crawler starts. - purge_request_queue: If this is `True` and the crawler is not being run for the first time, the request - queue will be purged. Named request queues are considered persistent and are never purged - implicitly. + purge_request_queue: If this is `True` and the crawler is not being run for the first time, the default + request queue will be purged. """ if self._running: raise RuntimeError( @@ -723,18 +722,7 @@ async def run( if purge_request_queue: request_manager = await self.get_request_manager() - # A `ThrottlingRequestManager` delegates `purge` to the manager it wraps, so inspect the wrapped - # manager when deciding whether the purge would hit a named queue. - inner_manager = ( - request_manager._inner # noqa: SLF001 - if isinstance(request_manager, ThrottlingRequestManager) - else request_manager - ) - # Named storages are persistent and shared across runs, so they are never purged implicitly - # (the same named-storage exemption as in `StorageClient._purge_if_needed`). - is_named_queue = isinstance(inner_manager, RequestQueue) and inner_manager.name is not None - if not is_named_queue: - await request_manager.purge() + await request_manager.purge() if requests is not None: await self.add_requests(requests) diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index 7f88b4f794..526a24b980 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -1108,45 +1108,6 @@ async def handler(context: BasicCrawlingContext) -> None: } -async def test_consecutive_runs_do_not_purge_named_request_queue() -> None: - """A second `run()` must not purge a user-supplied named request queue, as named storages persist across runs.""" - queue = await RequestQueue.open(name='persistent-queue') - crawler = BasicCrawler(request_manager=queue) - visited = list[str]() - - @crawler.router.default_handler - async def handler(context: BasicCrawlingContext) -> None: - visited.append(context.request.url) - - await crawler.run(['https://a.placeholder.com']) - - # Simulate new work added to the persistent queue between runs. - await queue.add_request('https://b.placeholder.com') - await crawler.run() - - assert visited == ['https://a.placeholder.com', 'https://b.placeholder.com'] - - -async def test_consecutive_runs_do_not_purge_named_queue_wrapped_in_throttling_manager() -> None: - """A second `run()` must not purge a named request queue wrapped in a `ThrottlingRequestManager`.""" - queue = await RequestQueue.open(name='persistent-queue') - throttler = ThrottlingRequestManager(inner=queue, domains=[], request_manager_opener=RequestQueue.open) - crawler = BasicCrawler(request_manager=throttler) - visited = list[str]() - - @crawler.router.default_handler - async def handler(context: BasicCrawlingContext) -> None: - visited.append(context.request.url) - - await crawler.run(['https://a.placeholder.com']) - - # Simulate new work added to the persistent queue between runs. - await queue.add_request('https://b.placeholder.com') - await crawler.run() - - assert visited == ['https://a.placeholder.com', 'https://b.placeholder.com'] - - @pytest.mark.skipif(os.name == 'nt' and 'CI' in os.environ, reason='Skipped in Windows CI') @pytest.mark.parametrize( ('statistics_log_format'), diff --git a/uv.lock b/uv.lock index 916c85b83c..3bdfb65f4e 100644 --- a/uv.lock +++ b/uv.lock @@ -794,11 +794,8 @@ dependencies = [ [package.optional-dependencies] adaptive-crawler = [ { name = "apify-fingerprint-datapoints" }, - { name = "beautifulsoup4", extra = ["lxml"] }, { name = "browserforge" }, - { name = "html5lib" }, { name = "jaro-winkler" }, - { name = "parsel" }, { name = "playwright" }, { name = "scikit-learn", version = "1.7.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "scikit-learn", version = "1.8.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, @@ -947,7 +944,6 @@ requires-dist = [ { name = "colorama", specifier = ">=0.4.0" }, { name = "cookiecutter", marker = "extra == 'cli'", specifier = ">=2.6.0" }, { name = "crawlee", extras = ["adaptive-crawler", "pydantic-ai", "beautifulsoup", "cli", "curl-impersonate", "httpx", "parsel", "playwright", "otel", "sql-sqlite", "sql-postgres", "sql-mysql", "stagehand", "redis"], marker = "extra == 'all'" }, - { name = "crawlee", extras = ["beautifulsoup", "parsel"], marker = "extra == 'adaptive-crawler'" }, { name = "cryptography", marker = "extra == 'sql-mysql'", specifier = ">=46.0.5" }, { name = "curl-cffi", marker = "extra == 'curl-impersonate'", specifier = ">=0.9.0" }, { name = "html5lib", marker = "extra == 'beautifulsoup'", specifier = ">=1.0" },