Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 10 additions & 5 deletions src/crawlee/crawlers/_basic/_basic_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,8 @@ class _BasicCrawlerOptions(TypedDict):
"""If True, the crawler stops immediately when any request handler error occurs."""

configure_logging: NotRequired[bool]
"""If True, the crawler will set up logging infrastructure automatically."""
"""If True, the crawler will set up logging infrastructure automatically, unless the root logger already has
handlers configured by the host application."""

statistics_log_format: NotRequired[Literal['table', 'inline']]
"""If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain
Expand Down Expand Up @@ -349,7 +350,8 @@ def __init__(
abort_on_error: If True, the crawler stops immediately when any request handler error occurs.
keep_alive: If True, it will keep crawler alive even if there are no requests in queue.
Use `crawler.stop()` to exit the crawler.
configure_logging: If True, the crawler will set up logging infrastructure automatically.
configure_logging: If True, the crawler will set up logging infrastructure automatically, unless the root
logger already has handlers configured by the host application.
statistics_log_format: If 'table', displays crawler statistics as formatted tables in logs. If 'inline',
outputs statistics as plain text log messages.
respect_robots_txt_file: If set to `True`, the crawler will automatically try to fetch the robots.txt file
Expand Down Expand Up @@ -451,9 +453,12 @@ def __init__(
# Logging setup
if configure_logging:
root_logger = logging.getLogger()
configure_logger(root_logger, remove_old_handlers=True)
httpx_logger = logging.getLogger('httpx') # Silence HTTPX logger
httpx_logger.setLevel(logging.DEBUG if get_configured_log_level() <= logging.DEBUG else logging.WARNING)
# Leave the loggers untouched if the host application has already configured the root logger,
# mirroring `logging.basicConfig` semantics.
if not root_logger.handlers:
configure_logger(root_logger)
httpx_logger = logging.getLogger('httpx') # Silence HTTPX logger
httpx_logger.setLevel(logging.DEBUG if get_configured_log_level() <= logging.DEBUG else logging.WARNING)
self._logger = _logger or logging.getLogger(__name__)
if implicit_event_manager_with_explicit_config:
self._logger.warning(
Expand Down
35 changes: 35 additions & 0 deletions tests/unit/crawlers/_basic/test_basic_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import pytest

from crawlee import ConcurrencySettings, Glob, service_locator
from crawlee._log_config import CrawleeLogFormatter
from crawlee._request import Request, RequestState
from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, HttpMethod
from crawlee._utils.robots import RobotsTxtFile
Expand Down Expand Up @@ -1053,6 +1054,40 @@ def test_crawler_log() -> None:
crawler.log.info('Test log message')


def test_configure_logging_preserves_host_app_root_handlers() -> None:
    """Constructing a crawler must not remove handlers installed on the root logger by the host application.

    The root logger is process-global state, so the test snapshots its handlers up front, judges only the
    handlers that appear during construction, and restores the snapshot afterwards regardless of the outcome.
    """
    root_logger = logging.getLogger()
    saved_handlers = root_logger.handlers[:]
    host_handler = logging.NullHandler()
    root_logger.addHandler(host_handler)
    handlers_before = root_logger.handlers[:]

    try:
        BasicCrawler()
        assert host_handler in root_logger.handlers
        # The host-configured root logger must not gain Crawlee's own handler either. Only handlers added
        # during construction are inspected, so leftovers from other tests cannot cause a false failure.
        added_handlers = [handler for handler in root_logger.handlers if handler not in handlers_before]
        assert not any(isinstance(handler.formatter, CrawleeLogFormatter) for handler in added_handlers)
    finally:
        # Restore the exact pre-test handler list, even if the crawler removed or added handlers.
        for handler in root_logger.handlers[:]:
            root_logger.removeHandler(handler)
        for handler in saved_handlers:
            root_logger.addHandler(handler)


def test_configure_logging_sets_up_unconfigured_root_logger() -> None:
    """Constructing a crawler installs Crawlee's log handler when the root logger has no handlers yet.

    The root logger is emptied for the duration of the test. Its handlers and level, as well as the level of
    the `httpx` logger (which the crawler adjusts in the same code path), are restored afterwards so that
    no global logging state leaks into other tests.
    """
    root_logger = logging.getLogger()
    httpx_logger = logging.getLogger('httpx')
    saved_handlers = root_logger.handlers[:]
    saved_level = root_logger.level
    saved_httpx_level = httpx_logger.level
    for handler in saved_handlers:
        root_logger.removeHandler(handler)

    try:
        BasicCrawler()
        assert any(isinstance(handler.formatter, CrawleeLogFormatter) for handler in root_logger.handlers)
    finally:
        # Drop whatever the crawler installed, then put the original handlers and levels back.
        for handler in root_logger.handlers[:]:
            root_logger.removeHandler(handler)
        for handler in saved_handlers:
            root_logger.addHandler(handler)
        root_logger.setLevel(saved_level)
        httpx_logger.setLevel(saved_httpx_level)


async def test_consecutive_runs_purge_request_queue() -> None:
crawler = BasicCrawler()
visit = Mock()
Expand Down
Loading