Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,12 @@ keywords = [
]
dependencies = [
"async-timeout>=5.0.1",
"browserforge>=1.2.4",
"cachetools>=5.5.0",
"colorama>=0.4.0",
"impit>=0.8.0",
"more-itertools>=10.2.0",
"playwright>=1.58.0",
"protego>=0.5.0",
"psutil>=6.0.0",
"pydantic-settings>=2.12.0",
Expand All @@ -55,15 +57,15 @@ adaptive-crawler = [
"jaro-winkler>=2.0.3",
"playwright>=1.27.0",
"scikit-learn>=1.6.0",
"apify_fingerprint_datapoints>=0.0.3",
"apify_fingerprint_datapoints>=0.11.0",
"browserforge>=1.2.4"
]
beautifulsoup = ["beautifulsoup4[lxml]>=4.12.0", "html5lib>=1.0"]
cli = ["cookiecutter>=2.6.0", "inquirer>=3.3.0", "rich>=13.9.0", "typer>=0.12.0"]
curl-impersonate = ["curl-cffi>=0.9.0"]
httpx = ["httpx[brotli,http2,zstd]>=0.27.0", "apify_fingerprint_datapoints>=0.0.2", "browserforge>=1.2.3"]
httpx = ["httpx[brotli,http2,zstd]>=0.27.0", "apify_fingerprint_datapoints>=0.11.0", "browserforge>=1.2.3"]
parsel = ["parsel>=1.10.0"]
playwright = ["playwright>=1.27.0", "apify_fingerprint_datapoints>=0.0.2", "browserforge>=1.2.3"]
playwright = ["playwright>=1.27.0", "apify_fingerprint_datapoints>=0.11.0", "browserforge>=1.2.3"]
otel = [
"opentelemetry-api>=1.34.1",
"opentelemetry-distro[otlp]>=0.54",
Expand Down
47 changes: 40 additions & 7 deletions src/crawlee/browsers/_playwright_browser_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@

from __future__ import annotations

import inspect
from asyncio import Lock
from datetime import datetime, timedelta, timezone
from typing import TYPE_CHECKING, Any, cast

from browserforge.injectors.playwright import AsyncNewContext
from playwright.async_api import Browser, BrowserContext, Page, ProxySettings
from playwright.async_api import BrowserType as PlaywrightBrowserType
from typing_extensions import override

from crawlee._utils.docs import docs_group
Expand All @@ -27,6 +29,14 @@

logger = getLogger(__name__)

def _context_option_names(method: Any) -> frozenset[str]:
    """Return the option names accepted by a Playwright method.

    The methods are introspected through their class, so the reported signature still contains the
    implicit `self` parameter. It is not a user-facing option and must not be accepted as one, so it
    is removed here.

    Args:
        method: The unbound Playwright method to introspect.

    Returns:
        The parameter names of `method`, without `self`.
    """
    return frozenset(inspect.signature(method).parameters) - {'self'}


# Introspect the Playwright signatures once at import time to avoid the overhead in the critical path.
_launch_persistent_context_params = _context_option_names(PlaywrightBrowserType.launch_persistent_context)
_new_context_params = _context_option_names(Browser.new_context)

# Options accepted by both `launch_persistent_context` and `new_context`.
_common_context_options = _launch_persistent_context_params & _new_context_params
# Options accepted only by `launch_persistent_context`.
_persistent_unique_context_options = _launch_persistent_context_params - _new_context_params
# Options accepted only by `new_context`.
_incognito_unique_context_options = _new_context_params - _launch_persistent_context_params


@docs_group('Browser management')
class PlaywrightBrowserController(BrowserController):
Expand Down Expand Up @@ -222,11 +232,36 @@ async def _create_browser_context(
`self._fingerprint_generator` is available.
"""
browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {}

filtered_options = {}
for key, value in browser_new_context_options.items():
if self._use_incognito_pages:
# Incognito mode (new_context)
if key in _common_context_options or key in _incognito_unique_context_options:
filtered_options[key] = value
elif key in _persistent_unique_context_options:
logger.warning(
f'Option "{key}" is only supported in persistent context mode '
'(use_incognito_pages=False) and will be ignored.'
)
else:
raise TypeError(f'"{key}" is not a valid Playwright context option.')
elif key in _common_context_options or key in _persistent_unique_context_options:
# Persistent mode (launch_persistent_context)
filtered_options[key] = value
elif key in _incognito_unique_context_options:
logger.warning(
f'Option "{key}" is only supported in incognito context mode '
'(use_incognito_pages=True) and will be ignored.'
)
else:
raise TypeError(f'"{key}" is not a valid Playwright context option.')

if proxy_info:
if browser_new_context_options.get('proxy'):
if filtered_options.get('proxy'):
logger.warning("browser_new_context_options['proxy'] overridden by explicit `proxy_info` argument.")

browser_new_context_options['proxy'] = ProxySettings(
filtered_options['proxy'] = ProxySettings(
server=f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}',
username=proxy_info.username,
password=proxy_info.password,
Expand All @@ -236,7 +271,7 @@ async def _create_browser_context(
return await AsyncNewContext(
browser=self._browser,
fingerprint=self._fingerprint_generator.generate(),
**browser_new_context_options,
**filtered_options,
)

if self._header_generator:
Expand All @@ -256,7 +291,5 @@ async def _create_browser_context(
else:
extra_http_headers = None

browser_new_context_options['extra_http_headers'] = browser_new_context_options.get(
'extra_http_headers', extra_http_headers
)
return await self._browser.new_context(**browser_new_context_options)
filtered_options['extra_http_headers'] = filtered_options.get('extra_http_headers', extra_http_headers)
return await self._browser.new_context(**filtered_options)
62 changes: 62 additions & 0 deletions tests/unit/browsers/test_playwright_controller_validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from __future__ import annotations

import logging
from typing import TYPE_CHECKING

import pytest
from playwright.async_api import Browser, Playwright, async_playwright

from crawlee.browsers import PlaywrightBrowserController

if TYPE_CHECKING:
from collections.abc import AsyncGenerator


@pytest.fixture
async def playwright() -> AsyncGenerator[Playwright, None]:
    """Yield the object produced by `async_playwright()` for the duration of a test."""
    async with async_playwright() as driver:
        yield driver


@pytest.fixture
async def browser(playwright: Playwright) -> AsyncGenerator[Browser, None]:
    """Launch a Chromium browser for a test and close it during fixture teardown."""
    launched_browser = await playwright.chromium.launch()
    yield launched_browser
    # Teardown: runs once the test using the fixture has finished.
    await launched_browser.close()


async def test_controller_validation_typo(browser: Browser) -> None:
    """An unknown (misspelled) context option must be rejected with a `TypeError`."""
    controller = PlaywrightBrowserController(browser)
    try:
        # `match` is a regular expression, so the trailing dot is escaped to match it literally.
        with pytest.raises(TypeError, match=r'"headles" is not a valid Playwright context option\.'):
            await controller.new_page(browser_new_context_options={'headles': True})
    finally:
        # Close the controller even when the expectation above fails.
        await controller.close()


async def test_controller_validation_cross_mode_persistent(browser: Browser, caplog: pytest.LogCaptureFixture) -> None:
    """An incognito-only option passed in persistent mode is ignored with a warning."""
    # Persistent mode is selected explicitly (use_incognito_pages=False).
    controller = PlaywrightBrowserController(browser, use_incognito_pages=False)
    try:
        # `storage_state` is used here as an incognito-only option.
        with caplog.at_level(logging.WARNING):
            page = await controller.new_page(
                browser_new_context_options={'storage_state': {'cookies': [], 'origins': []}}
            )
        assert 'Option "storage_state" is only supported in incognito context mode' in caplog.text
        await page.close()
    finally:
        # Close the controller even when page creation or the assertion fails.
        await controller.close()


async def test_controller_validation_cross_mode_incognito(browser: Browser, caplog: pytest.LogCaptureFixture) -> None:
    """A persistent-only option passed in incognito mode is ignored with a warning."""
    controller = PlaywrightBrowserController(browser, use_incognito_pages=True)
    try:
        # `env` is used here as a persistent-only option.
        with caplog.at_level(logging.WARNING):
            page = await controller.new_page(browser_new_context_options={'env': {}})
        assert 'Option "env" is only supported in persistent context mode' in caplog.text
        await page.close()
    finally:
        # Close the controller even when page creation or the assertion fails.
        await controller.close()


async def test_controller_validation_valid_common(browser: Browser) -> None:
    """An option valid in both context modes is forwarded and takes effect on the page."""
    controller = PlaywrightBrowserController(browser)
    try:
        # `viewport` is used here as an option common to both modes.
        page = await controller.new_page(browser_new_context_options={'viewport': {'width': 800, 'height': 600}})
        assert page.viewport_size == {'width': 800, 'height': 600}
        await page.close()
    finally:
        # Close the controller even when page creation or the assertion fails.
        await controller.close()
10 changes: 7 additions & 3 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.