Skip to content
55 changes: 48 additions & 7 deletions src/crawlee/browsers/_playwright_browser_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,15 @@

from __future__ import annotations

import inspect
from asyncio import Lock
from datetime import datetime, timedelta, timezone
from functools import lru_cache
from typing import TYPE_CHECKING, Any, cast

from browserforge.injectors.playwright import AsyncNewContext
from playwright.async_api import Browser, BrowserContext, Page, ProxySettings
from playwright.async_api import BrowserType as PlaywrightBrowserType
from typing_extensions import override

from crawlee._utils.docs import docs_group
Expand All @@ -28,6 +31,18 @@
logger = getLogger(__name__)


# Cache Playwright signatures to avoid overhead in critical path
@lru_cache(maxsize=1)
def _get_context_params_cache() -> dict[str, set[str]]:
launch_persistent_params = set(inspect.signature(PlaywrightBrowserType.launch_persistent_context).parameters)
new_context_params = set(inspect.signature(Browser.new_context).parameters)
return {
'common': launch_persistent_params & new_context_params,
'persistent_unique': launch_persistent_params - new_context_params,
'incognito_unique': new_context_params - launch_persistent_params,
}


@docs_group('Browser management')
class PlaywrightBrowserController(BrowserController):
"""Controller for managing Playwright browser instances and their pages.
Expand Down Expand Up @@ -209,6 +224,31 @@ def _on_page_close(self, page: Page) -> None:
"""Handle actions after a page is closed."""
self._pages.remove(page)

def _filter_context_options(self, options: dict[str, Any]) -> dict[str, Any]:
"""Filter browser context options based on the current mode (incognito vs persistent).

Options that are valid only in the other mode are dropped with a warning. Unrecognized options are kept
and passed through so that Playwright itself can raise an appropriate error.
"""
params_cache = _get_context_params_cache()
filtered = dict[str, Any]()

for key, value in options.items():
if self._use_incognito_pages and key in params_cache['persistent_unique']:
logger.warning(
f'Option "{key}" is only supported in persistent context mode '
'(use_incognito_pages=False) and will be ignored.'
)
elif not self._use_incognito_pages and key in params_cache['incognito_unique']:
logger.warning(
f'Option "{key}" is only supported in incognito context mode '
'(use_incognito_pages=True) and will be ignored.'
)
else:
filtered[key] = value

return filtered

async def _create_browser_context(
self,
browser_new_context_options: Mapping[str, Any] | None = None,
Expand All @@ -222,11 +262,14 @@ async def _create_browser_context(
`self._fingerprint_generator` is available.
"""
browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {}

filtered_options = self._filter_context_options(browser_new_context_options)

if proxy_info:
if browser_new_context_options.get('proxy'):
if filtered_options.get('proxy'):
logger.warning("browser_new_context_options['proxy'] overridden by explicit `proxy_info` argument.")

browser_new_context_options['proxy'] = ProxySettings(
filtered_options['proxy'] = ProxySettings(
server=f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}',
username=proxy_info.username,
password=proxy_info.password,
Expand All @@ -236,7 +279,7 @@ async def _create_browser_context(
return await AsyncNewContext(
browser=self._browser,
fingerprint=self._fingerprint_generator.generate(),
**browser_new_context_options,
**filtered_options,
)

if self._header_generator:
Expand All @@ -256,7 +299,5 @@ async def _create_browser_context(
else:
extra_http_headers = None

browser_new_context_options['extra_http_headers'] = browser_new_context_options.get(
'extra_http_headers', extra_http_headers
)
return await self._browser.new_context(**browser_new_context_options)
filtered_options['extra_http_headers'] = filtered_options.get('extra_http_headers', extra_http_headers)
return await self._browser.new_context(**filtered_options)
63 changes: 63 additions & 0 deletions tests/unit/browsers/test_playwright_controller_validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from __future__ import annotations

import logging
from typing import TYPE_CHECKING

import pytest
from playwright.async_api import Browser, Playwright, async_playwright

from crawlee.browsers import PlaywrightBrowserController

if TYPE_CHECKING:
from collections.abc import AsyncGenerator


@pytest.fixture
async def playwright() -> AsyncGenerator[Playwright, None]:
async with async_playwright() as playwright:
yield playwright


@pytest.fixture
async def browser(playwright: Playwright) -> AsyncGenerator[Browser, None]:
browser = await playwright.chromium.launch()
yield browser
await browser.close()


async def test_controller_validation_typo_passed_through(browser: Browser) -> None:
"""Invalid options (e.g. typos) are passed through so Playwright raises its own error."""
controller = PlaywrightBrowserController(browser)
with pytest.raises(TypeError):
await controller.new_page(browser_new_context_options={'headles': True})
await controller.close()


async def test_controller_validation_cross_mode_persistent(browser: Browser, caplog: pytest.LogCaptureFixture) -> None:
# Default is persistent mode (use_incognito_pages=False)
controller = PlaywrightBrowserController(browser, use_incognito_pages=False)
# storage_state is incognito-only
with caplog.at_level(logging.WARNING):
page = await controller.new_page(browser_new_context_options={'storage_state': {'cookies': [], 'origins': []}})
assert 'Option "storage_state" is only supported in incognito context mode' in caplog.text
await page.close()
await controller.close()


async def test_controller_validation_cross_mode_incognito(browser: Browser, caplog: pytest.LogCaptureFixture) -> None:
controller = PlaywrightBrowserController(browser, use_incognito_pages=True)
# env is persistent-only
with caplog.at_level(logging.WARNING):
page = await controller.new_page(browser_new_context_options={'env': {}})
assert 'Option "env" is only supported in persistent context mode' in caplog.text
await page.close()
await controller.close()


async def test_controller_validation_valid_common(browser: Browser) -> None:
controller = PlaywrightBrowserController(browser)
# viewport is common
page = await controller.new_page(browser_new_context_options={'viewport': {'width': 800, 'height': 600}})
assert page.viewport_size == {'width': 800, 'height': 600}
await page.close()
await controller.close()
Loading