diff --git a/docs/examples/code_examples/using_browser_profiles_chrome.py b/docs/examples/code_examples/using_browser_profiles_chrome.py index 55cd4d685b..6831a9b41d 100644 --- a/docs/examples/code_examples/using_browser_profiles_chrome.py +++ b/docs/examples/code_examples/using_browser_profiles_chrome.py @@ -27,15 +27,13 @@ async def main() -> None: crawler = PlaywrightCrawler( headless=False, - # Use chromium for Chrome compatibility - browser_type='chromium', + # Use the installed Chrome browser + browser_type='chrome', # Disable fingerprints to preserve profile identity fingerprint_generator=None, # Set user data directory to temp folder user_data_dir=tmp_profile_dir, browser_launch_options={ - # Use installed Chrome browser - 'channel': 'chrome', # Slow down actions to mimic human behavior 'slow_mo': 200, 'args': [ diff --git a/docs/examples/using_browser_profile.mdx b/docs/examples/using_browser_profile.mdx index a991a8012f..8eda2554a4 100644 --- a/docs/examples/using_browser_profile.mdx +++ b/docs/examples/using_browser_profile.mdx @@ -18,8 +18,6 @@ Using browser profiles allows you to leverage existing login sessions, saved pas To run `PlaywrightCrawler` with your Chrome profile, you need to know the path to your profile files. You can find this information by entering `chrome://version/` as a URL in your Chrome browser. If you have multiple profiles, pay attention to the profile name - if you only have one profile, it's always `Default`. -You also need to use the [`channel`](https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch-option-channel) parameter in `browser_launch_options` to use the Chrome browser installed on your system instead of Playwright's Chromium. - :::warning Profile access limitation Due to [Chrome's security policies](https://developer.chrome.com/blog/remote-debugging-port), automation cannot use your main browsing profile directly. The example copies your profile to a temporary location as a workaround. 
::: diff --git a/src/crawlee/browsers/_browser_pool.py b/src/crawlee/browsers/_browser_pool.py index 8e288605db..7d3fe0409c 100644 --- a/src/crawlee/browsers/_browser_pool.py +++ b/src/crawlee/browsers/_browser_pool.py @@ -118,7 +118,10 @@ def with_default_plugin( """Initialize a new instance with a single `PlaywrightBrowserPlugin` configured with the provided options. Args: - browser_type: The type of browser to launch ('chromium', 'firefox', or 'webkit'). + browser_type: The type of browser to launch: + - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers + - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on + the system. user_data_dir: Path to a user data directory, which stores browser session data like cookies and local storage. browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided diff --git a/src/crawlee/browsers/_playwright_browser_plugin.py b/src/crawlee/browsers/_playwright_browser_plugin.py index 1d047a1bc2..fe9eb09e6e 100644 --- a/src/crawlee/browsers/_playwright_browser_plugin.py +++ b/src/crawlee/browsers/_playwright_browser_plugin.py @@ -34,8 +34,8 @@ class PlaywrightBrowserPlugin(BrowserPlugin): It is a plugin designed to manage browser instances using the Playwright automation library. It acts as a factory for creating new browser instances and provides a unified interface for interacting with different browser types - (chromium, firefox, and webkit). This class integrates configuration options for browser launches (headless mode, - executable paths, sandboxing, ...). It also manages browser contexts and the number of pages open within each + (chromium, firefox, webkit and chrome). This class integrates configuration options for browser launches (headless + mode, executable paths, sandboxing, ...). 
It also manages browser contexts and the number of pages open within each browser instance, ensuring that resource limits are respected. """ @@ -55,7 +55,10 @@ def __init__( """Initialize a new instance. Args: - browser_type: The type of browser to launch ('chromium', 'firefox', or 'webkit'). + browser_type: The type of browser to launch: + - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers + - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on + the system. user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided @@ -80,6 +83,20 @@ def __init__( 'chromium_sandbox': not config.disable_browser_sandbox, } + # An explicit `executable_path` conflicts with the 'chrome' channel whether it comes from the configuration + # or from the caller's `browser_launch_options`, so validate the merged value, not only the default. + effective_launch_options = default_launch_browser_options | (browser_launch_options or {}) + if browser_type == 'chrome' and effective_launch_options.get('executable_path'): + raise ValueError( + 'Cannot use browser_type `chrome` with `Configuration.default_browser_path` or `executable_path` set.' + ) + + # Map 'chrome' to 'chromium' with the 'chrome' channel. + if browser_type == 'chrome': + browser_type = 'chromium' + # Chromium parameter 'channel' set to 'chrome' enables using installed Google Chrome.
+ default_launch_browser_options['channel'] = 'chrome' + self._browser_type: BrowserType = browser_type self._browser_launch_options: dict[str, Any] = default_launch_browser_options | (browser_launch_options or {}) self._browser_new_context_options = browser_new_context_options or {} diff --git a/src/crawlee/browsers/_types.py b/src/crawlee/browsers/_types.py index 40b9c87fb3..c5976b086a 100644 --- a/src/crawlee/browsers/_types.py +++ b/src/crawlee/browsers/_types.py @@ -6,7 +6,7 @@ if TYPE_CHECKING: from playwright.async_api import Page -BrowserType = Literal['chromium', 'firefox', 'webkit'] +BrowserType = Literal['chromium', 'firefox', 'webkit', 'chrome'] @dataclass diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py index c32a9c9f27..7b728fc213 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py +++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py @@ -114,7 +114,10 @@ def __init__( browser_pool: A `BrowserPool` instance to be used for launching the browsers and getting pages. user_data_dir: Path to a user data directory, which stores browser session data like cookies and local storage. - browser_type: The type of browser to launch ('chromium', 'firefox', or 'webkit'). + browser_type: The type of browser to launch: + - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers + - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on + the system. This option should not be used if `browser_pool` is provided. browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided directly to Playwright's `browser_type.launch` method.
For more details, refer to the @@ -153,7 +156,7 @@ def __init__( ): raise ValueError( 'You cannot provide `headless`, `browser_type`, `browser_launch_options`, ' - '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir` or' + '`browser_new_context_options`, `use_incognito_pages`, `user_data_dir` or ' '`fingerprint_generator` arguments when `browser_pool` is provided.' ) @@ -494,7 +497,9 @@ class _PlaywrightCrawlerAdditionalOptions(TypedDict): """A `BrowserPool` instance to be used for launching the browsers and getting pages.""" browser_type: NotRequired[BrowserType] - """The type of browser to launch ('chromium', 'firefox', or 'webkit'). + """The type of browser to launch: + - 'chromium', 'firefox', 'webkit': Use Playwright-managed browsers + - 'chrome': Use your locally installed Google Chrome browser. Requires Google Chrome to be installed on the system. This option should not be used if `browser_pool` is provided.""" browser_launch_options: NotRequired[Mapping[str, Any]] diff --git a/src/crawlee/fingerprint_suite/_header_generator.py b/src/crawlee/fingerprint_suite/_header_generator.py index e5c6b71281..1c7111db57 100644 --- a/src/crawlee/fingerprint_suite/_header_generator.py +++ b/src/crawlee/fingerprint_suite/_header_generator.py @@ -11,9 +11,9 @@ def fingerprint_browser_type_from_playwright_browser_type( - playwright_browser_type: Literal['chromium', 'firefox', 'webkit'], + playwright_browser_type: Literal['chromium', 'firefox', 'webkit', 'chrome'], ) -> SupportedBrowserType: - if playwright_browser_type == 'chromium': + if playwright_browser_type in {'chromium', 'chrome'}: return 'chrome' if playwright_browser_type == 'firefox': return 'firefox' diff --git a/tests/unit/browsers/test_playwright_browser_plugin.py b/tests/unit/browsers/test_playwright_browser_plugin.py index d34e2b8d06..966ced01bf 100644 --- a/tests/unit/browsers/test_playwright_browser_plugin.py +++ b/tests/unit/browsers/test_playwright_browser_plugin.py @@ -69,3 +69,13 @@
async def test_methods_raise_error_when_not_active() -> None: async with plugin: assert plugin.active is True + + +async def test_raise_error_if_chrome_and_executable_path() -> None: + with pytest.raises( + ValueError, match=r'Cannot use browser_type `chrome` with `Configuration.default_browser_path`' + ): + PlaywrightBrowserPlugin( + browser_type='chrome', + browser_launch_options={'executable_path': '/path/to/chrome'}, + )