From 1db8c20273ffbe622838426aa025b8966fbcd7e6 Mon Sep 17 00:00:00 2001
From: Vlada Dusek <v.dusek96@gmail.com>
Date: Thu, 2 Jul 2026 16:30:55 +0200
Subject: [PATCH 1/2] fix: Preserve host logging, keep named request queues
 across runs, fix adaptive-crawler extra

---
 pyproject.toml                                |  1 +
 src/crawlee/crawlers/_basic/_basic_crawler.py | 33 +++++++--
 .../crawlers/_basic/test_basic_crawler.py     | 74 +++++++++++++++++++
 uv.lock                                       |  4 +
 4 files changed, 104 insertions(+), 8 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 3a250c8101..e6546927bd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -52,6 +52,7 @@ dependencies = [
 [project.optional-dependencies]
 all = ["crawlee[adaptive-crawler,pydantic-ai,beautifulsoup,cli,curl-impersonate,httpx,parsel,playwright,otel,sql_sqlite,sql_postgres,sql_mysql,stagehand,redis]"]
 adaptive-crawler = [
+    "crawlee[beautifulsoup,parsel]",
     "jaro-winkler>=2.0.3",
     "playwright>=1.27.0",
     "scikit-learn>=1.6.0",
diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py
index be3da6dd27..b0294ffc2f 100644
--- a/src/crawlee/crawlers/_basic/_basic_crawler.py
+++ b/src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -179,7 +179,8 @@ class _BasicCrawlerOptions(TypedDict):
     """If True, the crawler stops immediately when any request handler error occurs."""
 
     configure_logging: NotRequired[bool]
-    """If True, the crawler will set up logging infrastructure automatically."""
+    """If True, the crawler sets up logging automatically. If the root logger already has handlers (e.g. installed
+    by the host application), the existing logging setup is left untouched."""
 
     statistics_log_format: NotRequired[Literal['table', 'inline']]
     """If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain
@@ -349,7 +350,8 @@ def __init__(
             abort_on_error: If True, the crawler stops immediately when any request handler error occurs.
             keep_alive: If True, it will keep crawler alive even if there are no requests in queue.
                 Use `crawler.stop()` to exit the crawler.
-            configure_logging: If True, the crawler will set up logging infrastructure automatically.
+            configure_logging: If True, the crawler sets up logging automatically. If the root logger already has
+                handlers (e.g. installed by the host application), the existing logging setup is left untouched.
             statistics_log_format: If 'table', displays crawler statistics as formatted tables in logs. If 'inline',
                 outputs statistics as plain text log messages.
             respect_robots_txt_file: If set to `True`, the crawler will automatically try to fetch the robots.txt file
@@ -451,9 +453,12 @@ def __init__(
         # Logging setup
         if configure_logging:
             root_logger = logging.getLogger()
-            configure_logger(root_logger, remove_old_handlers=True)
-            httpx_logger = logging.getLogger('httpx')  # Silence HTTPX logger
-            httpx_logger.setLevel(logging.DEBUG if get_configured_log_level() <= logging.DEBUG else logging.WARNING)
+            # Leave the loggers untouched if the host application has already configured the root logger,
+            # mirroring `logging.basicConfig` semantics.
+            if not root_logger.handlers:
+                configure_logger(root_logger)
+                httpx_logger = logging.getLogger('httpx')  # Silence HTTPX logger
+                httpx_logger.setLevel(logging.DEBUG if get_configured_log_level() <= logging.DEBUG else logging.WARNING)
         self._logger = _logger or logging.getLogger(__name__)
         if implicit_event_manager_with_explicit_config:
             self._logger.warning(
@@ -692,8 +697,9 @@ async def run(
 
         Args:
             requests: The requests to be enqueued before the crawler starts.
-            purge_request_queue: If this is `True` and the crawler is not being run for the first time, the default
-                request queue will be purged.
+            purge_request_queue: If this is `True` and the crawler is not being run for the first time, the request
+                queue will be purged. Named request queues are considered persistent and are never purged
+                implicitly.
         """
         if self._running:
             raise RuntimeError(
@@ -717,7 +723,18 @@ async def run(
 
             if purge_request_queue:
                 request_manager = await self.get_request_manager()
-                await request_manager.purge()
+                # A `ThrottlingRequestManager` delegates `purge` to the manager it wraps, so inspect the wrapped
+                # manager when deciding whether the purge would hit a named queue.
+                inner_manager = (
+                    request_manager._inner  # noqa: SLF001
+                    if isinstance(request_manager, ThrottlingRequestManager)
+                    else request_manager
+                )
+                # Named storages are persistent and shared across runs, so they are never purged implicitly
+                # (the same named-storage exemption as in `StorageClient._purge_if_needed`).
+                is_named_queue = isinstance(inner_manager, RequestQueue) and inner_manager.name is not None
+                if not is_named_queue:
+                    await request_manager.purge()
 
         if requests is not None:
             await self.add_requests(requests)
diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py
index 0391d65843..7f88b4f794 100644
--- a/tests/unit/crawlers/_basic/test_basic_crawler.py
+++ b/tests/unit/crawlers/_basic/test_basic_crawler.py
@@ -20,6 +20,7 @@
 import pytest
 
 from crawlee import ConcurrencySettings, Glob, service_locator
+from crawlee._log_config import CrawleeLogFormatter
 from crawlee._request import Request, RequestState
 from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, HttpMethod
 from crawlee._utils.robots import RobotsTxtFile
@@ -1053,6 +1054,40 @@ def test_crawler_log() -> None:
     crawler.log.info('Test log message')
 
 
+def test_configure_logging_preserves_host_app_root_handlers() -> None:
+    """Constructing a crawler must not remove handlers installed on the root logger by the host application."""
+    root_logger = logging.getLogger()
+    host_handler = logging.NullHandler()
+    root_logger.addHandler(host_handler)
+
+    try:
+        BasicCrawler()
+        assert host_handler in root_logger.handlers
+        # The host-configured root logger must not gain Crawlee's own handler either.
+        assert not any(isinstance(handler.formatter, CrawleeLogFormatter) for handler in root_logger.handlers)
+    finally:
+        root_logger.removeHandler(host_handler)
+
+
+def test_configure_logging_sets_up_unconfigured_root_logger() -> None:
+    """Constructing a crawler installs Crawlee's log handler when the root logger has no handlers yet."""
+    root_logger = logging.getLogger()
+    saved_handlers = root_logger.handlers[:]
+    saved_level = root_logger.level
+    for handler in saved_handlers:
+        root_logger.removeHandler(handler)
+
+    try:
+        BasicCrawler()
+        assert any(isinstance(handler.formatter, CrawleeLogFormatter) for handler in root_logger.handlers)
+    finally:
+        for handler in root_logger.handlers[:]:
+            root_logger.removeHandler(handler)
+        for handler in saved_handlers:
+            root_logger.addHandler(handler)
+        root_logger.setLevel(saved_level)
+
+
 async def test_consecutive_runs_purge_request_queue() -> None:
     crawler = BasicCrawler()
     visit = Mock()
@@ -1073,6 +1108,45 @@ async def handler(context: BasicCrawlingContext) -> None:
     }
 
 
+async def test_consecutive_runs_do_not_purge_named_request_queue() -> None:
+    """A second `run()` must not purge a user-supplied named request queue, as named storages persist across runs."""
+    queue = await RequestQueue.open(name='persistent-queue')
+    crawler = BasicCrawler(request_manager=queue)
+    visited = list[str]()
+
+    @crawler.router.default_handler
+    async def handler(context: BasicCrawlingContext) -> None:
+        visited.append(context.request.url)
+
+    await crawler.run(['https://a.placeholder.com'])
+
+    # Simulate new work added to the persistent queue between runs.
+    await queue.add_request('https://b.placeholder.com')
+    await crawler.run()
+
+    assert visited == ['https://a.placeholder.com', 'https://b.placeholder.com']
+
+
+async def test_consecutive_runs_do_not_purge_named_queue_wrapped_in_throttling_manager() -> None:
+    """A second `run()` must not purge a named request queue wrapped in a `ThrottlingRequestManager`."""
+    queue = await RequestQueue.open(name='persistent-queue')
+    throttler = ThrottlingRequestManager(inner=queue, domains=[], request_manager_opener=RequestQueue.open)
+    crawler = BasicCrawler(request_manager=throttler)
+    visited = list[str]()
+
+    @crawler.router.default_handler
+    async def handler(context: BasicCrawlingContext) -> None:
+        visited.append(context.request.url)
+
+    await crawler.run(['https://a.placeholder.com'])
+
+    # Simulate new work added to the persistent queue between runs.
+    await queue.add_request('https://b.placeholder.com')
+    await crawler.run()
+
+    assert visited == ['https://a.placeholder.com', 'https://b.placeholder.com']
+
+
 @pytest.mark.skipif(os.name == 'nt' and 'CI' in os.environ, reason='Skipped in Windows CI')
 @pytest.mark.parametrize(
     ('statistics_log_format'),
diff --git a/uv.lock b/uv.lock
index 3bdfb65f4e..916c85b83c 100644
--- a/uv.lock
+++ b/uv.lock
@@ -794,8 +794,11 @@ dependencies = [
 [package.optional-dependencies]
 adaptive-crawler = [
     { name = "apify-fingerprint-datapoints" },
+    { name = "beautifulsoup4", extra = ["lxml"] },
     { name = "browserforge" },
+    { name = "html5lib" },
     { name = "jaro-winkler" },
+    { name = "parsel" },
     { name = "playwright" },
     { name = "scikit-learn", version = "1.7.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
     { name = "scikit-learn", version = "1.8.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
@@ -944,6 +947,7 @@ requires-dist = [
     { name = "colorama", specifier = ">=0.4.0" },
     { name = "cookiecutter", marker = "extra == 'cli'", specifier = ">=2.6.0" },
     { name = "crawlee", extras = ["adaptive-crawler", "pydantic-ai", "beautifulsoup", "cli", "curl-impersonate", "httpx", "parsel", "playwright", "otel", "sql-sqlite", "sql-postgres", "sql-mysql", "stagehand", "redis"], marker = "extra == 'all'" },
+    { name = "crawlee", extras = ["beautifulsoup", "parsel"], marker = "extra == 'adaptive-crawler'" },
     { name = "cryptography", marker = "extra == 'sql-mysql'", specifier = ">=46.0.5" },
     { name = "curl-cffi", marker = "extra == 'curl-impersonate'", specifier = ">=0.9.0" },
     { name = "html5lib", marker = "extra == 'beautifulsoup'", specifier = ">=1.0" },

From 7781398f357897bca4efd49246246e235b187a9b Mon Sep 17 00:00:00 2001
From: Vlada Dusek <v.dusek96@gmail.com>
Date: Fri, 3 Jul 2026 09:44:34 +0200
Subject: [PATCH 2/2] chore: Split named-queue and adaptive-crawler fixes into
 separate PRs

---
 pyproject.toml                                |  1 -
 src/crawlee/crawlers/_basic/_basic_crawler.py | 18 ++-------
 .../crawlers/_basic/test_basic_crawler.py     | 39 -------------------
 uv.lock                                       |  4 --
 4 files changed, 3 insertions(+), 59 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index e6546927bd..3a250c8101 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -52,7 +52,6 @@ dependencies = [
 [project.optional-dependencies]
 all = ["crawlee[adaptive-crawler,pydantic-ai,beautifulsoup,cli,curl-impersonate,httpx,parsel,playwright,otel,sql_sqlite,sql_postgres,sql_mysql,stagehand,redis]"]
 adaptive-crawler = [
-    "crawlee[beautifulsoup,parsel]",
     "jaro-winkler>=2.0.3",
     "playwright>=1.27.0",
     "scikit-learn>=1.6.0",
diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py
index b0294ffc2f..fd56a259a4 100644
--- a/src/crawlee/crawlers/_basic/_basic_crawler.py
+++ b/src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -697,9 +697,8 @@ async def run(
 
         Args:
             requests: The requests to be enqueued before the crawler starts.
-            purge_request_queue: If this is `True` and the crawler is not being run for the first time, the request
-                queue will be purged. Named request queues are considered persistent and are never purged
-                implicitly.
+            purge_request_queue: If this is `True` and the crawler is not being run for the first time, the default
+                request queue will be purged.
         """
         if self._running:
             raise RuntimeError(
@@ -723,18 +722,7 @@ async def run(
 
             if purge_request_queue:
                 request_manager = await self.get_request_manager()
-                # A `ThrottlingRequestManager` delegates `purge` to the manager it wraps, so inspect the wrapped
-                # manager when deciding whether the purge would hit a named queue.
-                inner_manager = (
-                    request_manager._inner  # noqa: SLF001
-                    if isinstance(request_manager, ThrottlingRequestManager)
-                    else request_manager
-                )
-                # Named storages are persistent and shared across runs, so they are never purged implicitly
-                # (the same named-storage exemption as in `StorageClient._purge_if_needed`).
-                is_named_queue = isinstance(inner_manager, RequestQueue) and inner_manager.name is not None
-                if not is_named_queue:
-                    await request_manager.purge()
+                await request_manager.purge()
 
         if requests is not None:
             await self.add_requests(requests)
diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py
index 7f88b4f794..526a24b980 100644
--- a/tests/unit/crawlers/_basic/test_basic_crawler.py
+++ b/tests/unit/crawlers/_basic/test_basic_crawler.py
@@ -1108,45 +1108,6 @@ async def handler(context: BasicCrawlingContext) -> None:
     }
 
 
-async def test_consecutive_runs_do_not_purge_named_request_queue() -> None:
-    """A second `run()` must not purge a user-supplied named request queue, as named storages persist across runs."""
-    queue = await RequestQueue.open(name='persistent-queue')
-    crawler = BasicCrawler(request_manager=queue)
-    visited = list[str]()
-
-    @crawler.router.default_handler
-    async def handler(context: BasicCrawlingContext) -> None:
-        visited.append(context.request.url)
-
-    await crawler.run(['https://a.placeholder.com'])
-
-    # Simulate new work added to the persistent queue between runs.
-    await queue.add_request('https://b.placeholder.com')
-    await crawler.run()
-
-    assert visited == ['https://a.placeholder.com', 'https://b.placeholder.com']
-
-
-async def test_consecutive_runs_do_not_purge_named_queue_wrapped_in_throttling_manager() -> None:
-    """A second `run()` must not purge a named request queue wrapped in a `ThrottlingRequestManager`."""
-    queue = await RequestQueue.open(name='persistent-queue')
-    throttler = ThrottlingRequestManager(inner=queue, domains=[], request_manager_opener=RequestQueue.open)
-    crawler = BasicCrawler(request_manager=throttler)
-    visited = list[str]()
-
-    @crawler.router.default_handler
-    async def handler(context: BasicCrawlingContext) -> None:
-        visited.append(context.request.url)
-
-    await crawler.run(['https://a.placeholder.com'])
-
-    # Simulate new work added to the persistent queue between runs.
-    await queue.add_request('https://b.placeholder.com')
-    await crawler.run()
-
-    assert visited == ['https://a.placeholder.com', 'https://b.placeholder.com']
-
-
 @pytest.mark.skipif(os.name == 'nt' and 'CI' in os.environ, reason='Skipped in Windows CI')
 @pytest.mark.parametrize(
     ('statistics_log_format'),
diff --git a/uv.lock b/uv.lock
index 916c85b83c..3bdfb65f4e 100644
--- a/uv.lock
+++ b/uv.lock
@@ -794,11 +794,8 @@ dependencies = [
 [package.optional-dependencies]
 adaptive-crawler = [
     { name = "apify-fingerprint-datapoints" },
-    { name = "beautifulsoup4", extra = ["lxml"] },
     { name = "browserforge" },
-    { name = "html5lib" },
     { name = "jaro-winkler" },
-    { name = "parsel" },
     { name = "playwright" },
     { name = "scikit-learn", version = "1.7.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
     { name = "scikit-learn", version = "1.8.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
@@ -947,7 +944,6 @@ requires-dist = [
     { name = "colorama", specifier = ">=0.4.0" },
     { name = "cookiecutter", marker = "extra == 'cli'", specifier = ">=2.6.0" },
     { name = "crawlee", extras = ["adaptive-crawler", "pydantic-ai", "beautifulsoup", "cli", "curl-impersonate", "httpx", "parsel", "playwright", "otel", "sql-sqlite", "sql-postgres", "sql-mysql", "stagehand", "redis"], marker = "extra == 'all'" },
-    { name = "crawlee", extras = ["beautifulsoup", "parsel"], marker = "extra == 'adaptive-crawler'" },
     { name = "cryptography", marker = "extra == 'sql-mysql'", specifier = ">=46.0.5" },
     { name = "curl-cffi", marker = "extra == 'curl-impersonate'", specifier = ">=0.9.0" },
     { name = "html5lib", marker = "extra == 'beautifulsoup'", specifier = ">=1.0" },