Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
49d8db1
core implementation
Mantisus Sep 13, 2025
fe3eee1
add fakeredis
Mantisus Sep 13, 2025
97ca5ea
Merge branch 'master' into redis
Mantisus Sep 13, 2025
997fca3
add support for NDU storages
Mantisus Sep 13, 2025
3c1aeed
clean code
Mantisus Sep 13, 2025
75d81d8
up docs
Mantisus Sep 14, 2025
31a1fa9
update guide
Mantisus Sep 14, 2025
d46ffbe
add in built-id
Mantisus Sep 14, 2025
5b77ab6
add tests for Redis clients
Mantisus Sep 15, 2025
ccf713c
resolve
Mantisus Sep 15, 2025
7c84ed1
suppress warnings
Mantisus Sep 15, 2025
a1f4403
resolve
Mantisus Sep 16, 2025
122c923
add default dedup strategy
Mantisus Sep 28, 2025
32cfe63
resolve
Mantisus Sep 28, 2025
ec34386
up tests
Mantisus Sep 28, 2025
c1dda54
up docs
Mantisus Sep 28, 2025
ad1d055
resolve
Mantisus Oct 20, 2025
c5d0941
save request state with reclaim
Mantisus Oct 20, 2025
a4a8a5b
Update src/crawlee/storage_clients/_redis/_storage_client.py
Mantisus Oct 22, 2025
f75d110
up first part
Mantisus Oct 22, 2025
3b4f18d
up second part
Mantisus Oct 22, 2025
f93215a
resolve
Mantisus Oct 24, 2025
5067613
up redis to 7.0.0
Mantisus Oct 24, 2025
9373e55
Update docs/guides/storage_clients.mdx
Mantisus Oct 30, 2025
0b4e2eb
Update docs/guides/storage_clients.mdx
Mantisus Oct 30, 2025
67b6ab1
Update pyproject.toml
Mantisus Oct 30, 2025
c3f07a5
Update src/crawlee/storage_clients/_redis/_storage_client.py
Mantisus Oct 30, 2025
8975c0f
up docs and tests
Mantisus Oct 30, 2025
7a2bebe
resolve
Mantisus Nov 10, 2025
34ec87d
Update pyproject.toml
vdusek Nov 11, 2025
7a3304a
Update uv.lock
vdusek Nov 11, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from crawlee.crawlers import ParselCrawler
from crawlee.storage_clients import RedisStorageClient

# Create a new storage client instance using a connection string.
# 'redis://localhost:6379' is just a placeholder; replace it with your actual
# connection string.
storage_client = RedisStorageClient(connection_string='redis://localhost:6379')

# And pass it to the crawler.
crawler = ParselCrawler(storage_client=storage_client)
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from redis.asyncio import Redis

from crawlee.configuration import Configuration
from crawlee.crawlers import ParselCrawler
from crawlee.storage_clients import RedisStorageClient

# Create a new storage client instance using a pre-configured Redis client.
# Replace host and port with your actual Redis server configuration.
# The remaining keyword arguments are redis-py client options; adjust them
# (or add others) as needed for your deployment.
storage_client = RedisStorageClient(
redis=Redis(
host='localhost',
port=6379,
retry_on_timeout=True,
socket_keepalive=True,
socket_connect_timeout=10,
)
)

# Create a configuration with custom settings.
# `purge_on_start=False` turns off purging of the default storages on start
# (the option defaults to `True`).
configuration = Configuration(purge_on_start=False)

# And pass both the storage client and the configuration to the crawler.
crawler = ParselCrawler(
storage_client=storage_client,
configuration=configuration,
)
178 changes: 175 additions & 3 deletions docs/guides/storage_clients.mdx
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should mention that Redis persistence is unlike that of filesystem or SQL storage and link to https://redis.io/docs/latest/operate/oss_and_stack/management/persistence/

Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ import CustomStorageClientExample from '!!raw-loader!roa-loader!./code_examples/
import RegisteringStorageClientsExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/registering_storage_clients_example.py';
import SQLStorageClientBasicExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/sql_storage_client_basic_example.py';
import SQLStorageClientConfigurationExample from '!!raw-loader!./code_examples/storage_clients/sql_storage_client_configuration_example.py';
import RedisStorageClientBasicExample from '!!raw-loader!./code_examples/storage_clients/redis_storage_client_basic_example.py';
import RedisStorageClientConfigurationExample from '!!raw-loader!./code_examples/storage_clients/redis_storage_client_configuration_example.py';

Storage clients provide a unified interface for interacting with <ApiLink to="class/Dataset">`Dataset`</ApiLink>, <ApiLink to="class/KeyValueStore">`KeyValueStore`</ApiLink>, and <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>, regardless of the underlying implementation. They handle operations like creating, reading, updating, and deleting storage instances, as well as managing data persistence and cleanup. This abstraction makes it easy to switch between different environments, such as local development and cloud production setups.

Expand All @@ -26,7 +28,8 @@ Crawlee provides three main storage client implementations:

- <ApiLink to="class/FileSystemStorageClient">`FileSystemStorageClient`</ApiLink> - Provides persistent file system storage with in-memory caching.
- <ApiLink to="class/MemoryStorageClient">`MemoryStorageClient`</ApiLink> - Stores data in memory with no persistence.
- <ApiLink to="class/SqlStorageClient">`SqlStorageClient`</ApiLink> – Provides persistent storage using a SQL database ([SQLite](https://sqlite.org/) or [PostgreSQL](https://www.postgresql.org/)). Requires installing the extra dependency: 'crawlee[sql_sqlite]' for SQLite or 'crawlee[sql_postgres]' for PostgreSQL.
- <ApiLink to="class/SqlStorageClient">`SqlStorageClient`</ApiLink> - Provides persistent storage using a SQL database ([SQLite](https://sqlite.org/) or [PostgreSQL](https://www.postgresql.org/)). Requires installing the extra dependency: `crawlee[sql_sqlite]` for SQLite or `crawlee[sql_postgres]` for PostgreSQL.
- <ApiLink to="class/RedisStorageClient">`RedisStorageClient`</ApiLink> - Provides persistent storage using a [Redis](https://redis.io/) database (version 8.0 or higher). Requires installing the extra dependency `crawlee[redis]`.
- [`ApifyStorageClient`](https://docs.apify.com/sdk/python/reference/class/ApifyStorageClient) - Manages storage on the [Apify platform](https://apify.com), implemented in the [Apify SDK](https://github.com/apify/apify-sdk-python).

```mermaid
Expand Down Expand Up @@ -56,6 +59,8 @@ class MemoryStorageClient

class SqlStorageClient

class RedisStorageClient

class ApifyStorageClient

%% ========================
Expand All @@ -65,6 +70,7 @@ class ApifyStorageClient
StorageClient --|> FileSystemStorageClient
StorageClient --|> MemoryStorageClient
StorageClient --|> SqlStorageClient
StorageClient --|> RedisStorageClient
StorageClient --|> ApifyStorageClient
```

Expand Down Expand Up @@ -304,15 +310,181 @@ Configuration options for the <ApiLink to="class/SqlStorageClient">`SqlStorageCl

Configuration options for the <ApiLink to="class/SqlStorageClient">`SqlStorageClient`</ApiLink> can be set via constructor arguments:

- **`connection_string`** (default: SQLite in <ApiLink to="class/Configuration">`Configuration`</ApiLink> storage dir) SQLAlchemy connection string, e.g. `sqlite+aiosqlite:///my.db` or `postgresql+asyncpg://user:pass@host/db`.
- **`engine`** Pre-configured SQLAlchemy AsyncEngine (optional).
- **`connection_string`** (default: SQLite in <ApiLink to="class/Configuration">`Configuration`</ApiLink> storage dir) - SQLAlchemy connection string, e.g. `sqlite+aiosqlite:///my.db` or `postgresql+asyncpg://user:pass@host/db`.
- **`engine`** - Pre-configured SQLAlchemy AsyncEngine (optional).

For advanced scenarios, you can configure <ApiLink to="class/SqlStorageClient">`SqlStorageClient`</ApiLink> with a custom SQLAlchemy engine and additional options via the <ApiLink to="class/Configuration">`Configuration`</ApiLink> class. This is useful, for example, when connecting to an external PostgreSQL database or customizing connection pooling.

<CodeBlock className="language-python" language="python">
{SQLStorageClientConfigurationExample}
</CodeBlock>

### Redis storage client

:::warning Experimental feature
The <ApiLink to="class/RedisStorageClient">`RedisStorageClient`</ApiLink> is experimental. Its API and behavior may change in future releases.
:::

The <ApiLink to="class/RedisStorageClient">`RedisStorageClient`</ApiLink> provides persistent storage using a [Redis](https://redis.io/) database. It supports concurrent access from multiple independent clients or processes and uses Redis native data structures for efficient operations.

:::note dependencies
The <ApiLink to="class/RedisStorageClient">`RedisStorageClient`</ApiLink> is not included in the core Crawlee package.
To use it, you need to install Crawlee with the Redis extra dependency:

<code>pip install 'crawlee[redis]'</code>

Additionally, Redis version 8.0 or higher is required.
:::

:::note Redis persistence
Data persistence in Redis depends on your [database configuration](https://redis.io/docs/latest/operate/oss_and_stack/management/persistence/).
:::

The client requires either a Redis connection string or a pre-configured Redis client instance. Use a pre-configured client when you need custom Redis settings such as connection pooling, timeouts, or SSL/TLS encryption.

<CodeBlock className="language-python" language="python">
{RedisStorageClientBasicExample}
</CodeBlock>

Data is organized using Redis key patterns. Below are the main data structures used for each storage type:

```mermaid
---
config:
class:
hideEmptyMembersBox: true
---

classDiagram

%% ========================
%% Storage Client
%% ========================

class RedisDatasetClient {
<<Dataset>>
}

%% ========================
%% Dataset Keys
%% ========================

class DatasetKeys {
datasets:[name]:items - JSON Array
datasets:[name]:metadata - JSON Object
}

class DatasetsIndexes {
datasets:id_to_name - Hash
datasets:name_to_id - Hash
}

%% ========================
%% Client to Keys arrows
%% ========================

RedisDatasetClient --> DatasetKeys
RedisDatasetClient --> DatasetsIndexes
```

```mermaid
---
config:
class:
hideEmptyMembersBox: true
---

classDiagram

%% ========================
%% Storage Clients
%% ========================

class RedisKeyValueStoreClient {
<<Key-value store>>
}

%% ========================
%% Key-Value Store Keys
%% ========================

class KeyValueStoreKeys {
key_value_stores:[name]:items - Hash
key_value_stores:[name]:metadata_items - Hash
key_value_stores:[name]:metadata - JSON Object
}

class KeyValueStoresIndexes {
key_value_stores:id_to_name - Hash
key_value_stores:name_to_id - Hash
}

%% ========================
%% Client to Keys arrows
%% ========================

RedisKeyValueStoreClient --> KeyValueStoreKeys
RedisKeyValueStoreClient --> KeyValueStoresIndexes
```

```mermaid
---
config:
class:
hideEmptyMembersBox: true
---

classDiagram

%% ========================
%% Storage Clients
%% ========================

class RedisRequestQueueClient {
<<Request queue>>
}

%% ========================
%% Request Queue Keys
%% ========================

class RequestQueueKeys{
request_queues:[name]:queue - List
request_queues:[name]:data - Hash
request_queues:[name]:in_progress - Hash
request_queues:[name]:added_bloom_filter - Bloom Filter | bloom queue_dedup_strategy
request_queues:[name]:handled_bloom_filter - Bloom Filter | bloom queue_dedup_strategy
request_queues:[name]:pending_set - Set | default queue_dedup_strategy
request_queues:[name]:handled_set - Set | default queue_dedup_strategy
request_queues:[name]:metadata - JSON Object
}

class RequestQueuesIndexes {
request_queues:id_to_name - Hash
request_queues:name_to_id - Hash
}

%% ========================
%% Client to Keys arrows
%% ========================

RedisRequestQueueClient --> RequestQueueKeys
RedisRequestQueueClient --> RequestQueuesIndexes
```

Configuration options for the <ApiLink to="class/RedisStorageClient">`RedisStorageClient`</ApiLink> can be set through environment variables or the <ApiLink to="class/Configuration">`Configuration`</ApiLink> class:

- **`purge_on_start`** (env: `CRAWLEE_PURGE_ON_START`, default: `True`) - Whether to purge default storages on start.

Configuration options for the <ApiLink to="class/RedisStorageClient">`RedisStorageClient`</ApiLink> can be set via constructor arguments:

- **`connection_string`** - Redis connection string, e.g. `redis://localhost:6379/0`.
- **`redis`** - Pre-configured Redis client instance (optional).

<CodeBlock className="language-python" language="python">
{RedisStorageClientConfigurationExample}
</CodeBlock>

## Creating a custom storage client

A storage client consists of two parts: the storage client factory and individual storage type clients. The <ApiLink to="class/StorageClient">`StorageClient`</ApiLink> acts as a factory that creates specific clients (<ApiLink to="class/DatasetClient">`DatasetClient`</ApiLink>, <ApiLink to="class/KeyValueStoreClient">`KeyValueStoreClient`</ApiLink>, <ApiLink to="class/RequestQueueClient">`RequestQueueClient`</ApiLink>) where the actual storage logic is implemented.
Expand Down
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ dependencies = [
]

[project.optional-dependencies]
all = ["crawlee[adaptive-crawler,beautifulsoup,cli,curl-impersonate,httpx,parsel,playwright,otel,sql_sqlite,sql_postgres]"]
all = ["crawlee[adaptive-crawler,beautifulsoup,cli,curl-impersonate,httpx,parsel,playwright,otel,sql_sqlite,sql_postgres,redis]"]
adaptive-crawler = [
"jaro-winkler>=2.0.3",
"playwright>=1.27.0",
Expand Down Expand Up @@ -79,6 +79,7 @@ sql_sqlite = [
"sqlalchemy[asyncio]>=2.0.0,<3.0.0",
"aiosqlite>=0.21.0",
]
redis = ["redis[hiredis] >= 7.0.0"]

[project.scripts]
crawlee = "crawlee._cli:cli"
Expand All @@ -98,6 +99,7 @@ dev = [
"apify_client", # For e2e tests.
"build<2.0.0", # For e2e tests.
"dycw-pytest-only<3.0.0",
"fakeredis[probabilistic,json,lua]<3.0.0",
"mypy~=1.18.0",
"pre-commit<5.0.0",
"proxy-py<3.0.0",
Expand Down
4 changes: 4 additions & 0 deletions src/crawlee/storage_clients/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,13 @@
with _try_import(__name__, 'SqlStorageClient'):
from ._sql import SqlStorageClient

with _try_import(__name__, 'RedisStorageClient'):
from ._redis import RedisStorageClient

__all__ = [
'FileSystemStorageClient',
'MemoryStorageClient',
'RedisStorageClient',
'SqlStorageClient',
'StorageClient',
]
6 changes: 6 additions & 0 deletions src/crawlee/storage_clients/_redis/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from ._dataset_client import RedisDatasetClient
from ._key_value_store_client import RedisKeyValueStoreClient
from ._request_queue_client import RedisRequestQueueClient
from ._storage_client import RedisStorageClient

# Names exported by this package.
__all__ = [
    'RedisDatasetClient',
    'RedisKeyValueStoreClient',
    'RedisRequestQueueClient',
    'RedisStorageClient',
]
Loading
Loading