# Patch summary: make `configure_logging` non-destructive for host applications.
#
# src/crawlee/crawlers/_basic/_basic_crawler.py
#   * `__init__` previously called `configure_logger(root_logger, remove_old_handlers=True)`
#     whenever `configure_logging` was true. It now calls `configure_logger(root_logger)` only
#     when `root_logger.handlers` is empty.
#   * The `httpx` logger level adjustment moves inside the same guard, so it is also skipped
#     when the root logger already has handlers.
#   * The `configure_logging` docs on `_BasicCrawlerOptions` and in the `__init__` docstring are
#     updated to describe the new condition.
#
# tests/unit/crawlers/_basic/test_basic_crawler.py
#   * Adds a test that a handler pre-installed on the root logger survives `BasicCrawler()` and
#     that no handler with a `CrawleeLogFormatter` is added next to it.
#   * Adds a test that `BasicCrawler()` installs a handler with a `CrawleeLogFormatter` when the
#     root logger starts with no handlers; the original handlers and level are restored afterwards.
diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py
index be3da6dd27..fd56a259a4 100644
--- a/src/crawlee/crawlers/_basic/_basic_crawler.py
+++ b/src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -179,7 +179,8 @@ class _BasicCrawlerOptions(TypedDict):
     """If True, the crawler stops immediately when any request handler error occurs."""
 
     configure_logging: NotRequired[bool]
-    """If True, the crawler will set up logging infrastructure automatically."""
+    """If True, the crawler will set up logging infrastructure automatically, unless the root logger already has
+    handlers configured by the host application."""
 
     statistics_log_format: NotRequired[Literal['table', 'inline']]
     """If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain
@@ -349,7 +350,8 @@ def __init__(
             abort_on_error: If True, the crawler stops immediately when any request handler error occurs.
             keep_alive: If True, it will keep crawler alive even if there are no requests in queue.
                 Use `crawler.stop()` to exit the crawler.
-            configure_logging: If True, the crawler will set up logging infrastructure automatically.
+            configure_logging: If True, the crawler will set up logging infrastructure automatically, unless the root
+                logger already has handlers configured by the host application.
             statistics_log_format: If 'table', displays crawler statistics as formatted tables in logs. If 'inline',
                 outputs statistics as plain text log messages.
             respect_robots_txt_file: If set to `True`, the crawler will automatically try to fetch the robots.txt file
@@ -451,9 +453,12 @@ def __init__(
         # Logging setup
         if configure_logging:
             root_logger = logging.getLogger()
-            configure_logger(root_logger, remove_old_handlers=True)
-            httpx_logger = logging.getLogger('httpx')  # Silence HTTPX logger
-            httpx_logger.setLevel(logging.DEBUG if get_configured_log_level() <= logging.DEBUG else logging.WARNING)
+            # Leave the loggers untouched if the host application has already configured the root logger,
+            # mirroring `logging.basicConfig` semantics.
+            if not root_logger.handlers:
+                configure_logger(root_logger)
+                httpx_logger = logging.getLogger('httpx')  # Silence HTTPX logger
+                httpx_logger.setLevel(logging.DEBUG if get_configured_log_level() <= logging.DEBUG else logging.WARNING)
         self._logger = _logger or logging.getLogger(__name__)
         if implicit_event_manager_with_explicit_config:
             self._logger.warning(
diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py
index 0391d65843..526a24b980 100644
--- a/tests/unit/crawlers/_basic/test_basic_crawler.py
+++ b/tests/unit/crawlers/_basic/test_basic_crawler.py
@@ -20,6 +20,7 @@
 import pytest
 
 from crawlee import ConcurrencySettings, Glob, service_locator
+from crawlee._log_config import CrawleeLogFormatter
 from crawlee._request import Request, RequestState
 from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, HttpMethod
 from crawlee._utils.robots import RobotsTxtFile
@@ -1053,6 +1054,40 @@ def test_crawler_log() -> None:
     crawler.log.info('Test log message')
 
 
+def test_configure_logging_preserves_host_app_root_handlers() -> None:
+    """Constructing a crawler must not remove handlers installed on the root logger by the host application."""
+    root_logger = logging.getLogger()
+    host_handler = logging.NullHandler()
+    root_logger.addHandler(host_handler)
+
+    try:
+        BasicCrawler()
+        assert host_handler in root_logger.handlers
+        # The host-configured root logger must not gain Crawlee's own handler either.
+        assert not any(isinstance(handler.formatter, CrawleeLogFormatter) for handler in root_logger.handlers)
+    finally:
+        root_logger.removeHandler(host_handler)
+
+
+def test_configure_logging_sets_up_unconfigured_root_logger() -> None:
+    """Constructing a crawler installs Crawlee's log handler when the root logger has no handlers yet."""
+    root_logger = logging.getLogger()
+    saved_handlers = root_logger.handlers[:]
+    saved_level = root_logger.level
+    for handler in saved_handlers:
+        root_logger.removeHandler(handler)
+
+    try:
+        BasicCrawler()
+        assert any(isinstance(handler.formatter, CrawleeLogFormatter) for handler in root_logger.handlers)
+    finally:
+        for handler in root_logger.handlers[:]:
+            root_logger.removeHandler(handler)
+        for handler in saved_handlers:
+            root_logger.addHandler(handler)
+        root_logger.setLevel(saved_level)
+
+
 async def test_consecutive_runs_purge_request_queue() -> None:
     crawler = BasicCrawler()
     visit = Mock()