diff --git a/src/crawlee/browsers/_playwright_browser_controller.py b/src/crawlee/browsers/_playwright_browser_controller.py index f386a7a903..c773c9eefe 100644 --- a/src/crawlee/browsers/_playwright_browser_controller.py +++ b/src/crawlee/browsers/_playwright_browser_controller.py @@ -2,12 +2,15 @@ from __future__ import annotations +import inspect from asyncio import Lock from datetime import datetime, timedelta, timezone +from functools import lru_cache from typing import TYPE_CHECKING, Any, cast from browserforge.injectors.playwright import AsyncNewContext from playwright.async_api import Browser, BrowserContext, Page, ProxySettings +from playwright.async_api import BrowserType as PlaywrightBrowserType from typing_extensions import override from crawlee._utils.docs import docs_group @@ -28,6 +31,18 @@ logger = getLogger(__name__) +# Cache Playwright signatures to avoid overhead in critical path +@lru_cache(maxsize=1) +def _get_context_params_cache() -> dict[str, set[str]]: + launch_persistent_params = set(inspect.signature(PlaywrightBrowserType.launch_persistent_context).parameters) + new_context_params = set(inspect.signature(Browser.new_context).parameters) + return { + 'common': launch_persistent_params & new_context_params, + 'persistent_unique': launch_persistent_params - new_context_params, + 'incognito_unique': new_context_params - launch_persistent_params, + } + + @docs_group('Browser management') class PlaywrightBrowserController(BrowserController): """Controller for managing Playwright browser instances and their pages. @@ -209,6 +224,31 @@ def _on_page_close(self, page: Page) -> None: """Handle actions after a page is closed.""" self._pages.remove(page) + def _filter_context_options(self, options: dict[str, Any]) -> dict[str, Any]: + """Filter browser context options based on the current mode (incognito vs persistent). + + Options that are valid only in the other mode are dropped with a warning. Unrecognized options are kept + and passed through so that Playwright itself can raise an appropriate error. + """ + params_cache = _get_context_params_cache() + filtered = dict[str, Any]() + + for key, value in options.items(): + if self._use_incognito_pages and key in params_cache['persistent_unique']: + logger.warning( + f'Option "{key}" is only supported in persistent context mode ' + '(use_incognito_pages=False) and will be ignored.' + ) + elif not self._use_incognito_pages and key in params_cache['incognito_unique']: + logger.warning( + f'Option "{key}" is only supported in incognito context mode ' + '(use_incognito_pages=True) and will be ignored.' + ) + else: + filtered[key] = value + + return filtered + async def _create_browser_context( self, browser_new_context_options: Mapping[str, Any] | None = None, @@ -222,11 +262,14 @@ async def _create_browser_context( `self._fingerprint_generator` is available. """ browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {} + + filtered_options = self._filter_context_options(browser_new_context_options) + if proxy_info: - if browser_new_context_options.get('proxy'): + if filtered_options.get('proxy'): logger.warning("browser_new_context_options['proxy'] overridden by explicit `proxy_info` argument.") - browser_new_context_options['proxy'] = ProxySettings( + filtered_options['proxy'] = ProxySettings( server=f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}', username=proxy_info.username, password=proxy_info.password, @@ -236,7 +279,7 @@ async def _create_browser_context( return await AsyncNewContext( browser=self._browser, fingerprint=self._fingerprint_generator.generate(), - **browser_new_context_options, + **filtered_options, ) if self._header_generator: @@ -256,7 +299,5 @@ async def _create_browser_context( else: extra_http_headers = None - browser_new_context_options['extra_http_headers'] = browser_new_context_options.get( - 'extra_http_headers', extra_http_headers - ) - return await self._browser.new_context(**browser_new_context_options) + filtered_options['extra_http_headers'] = filtered_options.get('extra_http_headers', extra_http_headers) + return await self._browser.new_context(**filtered_options) diff --git a/tests/unit/browsers/test_playwright_controller_validation.py b/tests/unit/browsers/test_playwright_controller_validation.py new file mode 100644 index 0000000000..d6ee3e441b --- /dev/null +++ b/tests/unit/browsers/test_playwright_controller_validation.py @@ -0,0 +1,63 @@ +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING + +import pytest +from playwright.async_api import Browser, Playwright, async_playwright + +from crawlee.browsers import PlaywrightBrowserController + +if TYPE_CHECKING: + from collections.abc import AsyncGenerator + + +@pytest.fixture +async def playwright() -> AsyncGenerator[Playwright, None]: + async with async_playwright() as playwright: + yield playwright + + +@pytest.fixture +async def browser(playwright: Playwright) -> AsyncGenerator[Browser, None]: + browser = await playwright.chromium.launch() + yield browser + await browser.close() + + +async def test_controller_validation_typo_passed_through(browser: Browser) -> None: + """Invalid options (e.g. typos) are passed through so Playwright raises its own error.""" + controller = PlaywrightBrowserController(browser) + with pytest.raises(TypeError): + await controller.new_page(browser_new_context_options={'headles': True}) + await controller.close() + + +async def test_controller_validation_cross_mode_persistent(browser: Browser, caplog: pytest.LogCaptureFixture) -> None: + # Default is persistent mode (use_incognito_pages=False) + controller = PlaywrightBrowserController(browser, use_incognito_pages=False) + # storage_state is incognito-only + with caplog.at_level(logging.WARNING): + page = await controller.new_page(browser_new_context_options={'storage_state': {'cookies': [], 'origins': []}}) + assert 'Option "storage_state" is only supported in incognito context mode' in caplog.text + await page.close() + await controller.close() + + +async def test_controller_validation_cross_mode_incognito(browser: Browser, caplog: pytest.LogCaptureFixture) -> None: + controller = PlaywrightBrowserController(browser, use_incognito_pages=True) + # env is persistent-only + with caplog.at_level(logging.WARNING): + page = await controller.new_page(browser_new_context_options={'env': {}}) + assert 'Option "env" is only supported in persistent context mode' in caplog.text + await page.close() + await controller.close() + + +async def test_controller_validation_valid_common(browser: Browser) -> None: + controller = PlaywrightBrowserController(browser) + # viewport is common + page = await controller.new_page(browser_new_context_options={'viewport': {'width': 800, 'height': 600}}) + assert page.viewport_size == {'width': 800, 'height': 600} + await page.close() + await controller.close()