From 181627d37f0df1289a7784e05eb7af79484b81cd Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Sun, 7 Sep 2025 21:10:41 +0200 Subject: [PATCH 1/8] Add a Lakeflow-friendly default-python template, adapted to replace the lakeflow-pipelines template --- .../lakeflow-pipelines/python/output.txt | 10 +- .../.vscode/extensions.json | 1 - .../.vscode/settings.json | 30 +++- .../output/my_lakeflow_pipelines/README.md | 49 ++++--- .../my_lakeflow_pipelines/databricks.yml | 11 +- .../lib/shared/__init__.py | 0 .../my_lakeflow_pipelines/lib/shared/taxis.py | 7 + .../my_lakeflow_pipelines/out.gitignore | 2 + .../my_lakeflow_pipelines/resources/.gitkeep | 1 + .../README.md | 12 +- .../explorations/sample_exploration.ipynb | 2 +- .../lakeflow_pipelines_etl.pipeline.yml | 15 ++ .../lakeflow_pipelines_job.job.yml | 14 +- .../sample_trips_my_lakeflow_pipelines.py | 4 +- .../sample_zones_my_lakeflow_pipelines.py | 9 +- .../lakeflow_pipelines_etl/utilities/utils.py | 12 ++ .../my_lakeflow_pipelines.job.yml | 19 --- .../my_lakeflow_pipelines.pipeline.yml | 12 -- .../utilities/utils.py | 8 -- .../lakeflow-pipelines/sql/output.txt | 10 +- .../.vscode/extensions.json | 1 - .../.vscode/settings.json | 30 +++- .../output/my_lakeflow_pipelines/README.md | 49 ++++--- .../my_lakeflow_pipelines/databricks.yml | 11 +- .../lib/shared/__init__.py | 0 .../my_lakeflow_pipelines/lib/shared/taxis.py | 7 + .../my_lakeflow_pipelines/out.gitignore | 2 + .../my_lakeflow_pipelines/resources/.gitkeep | 1 + .../README.md | 15 +- .../explorations/sample_exploration.ipynb | 15 +- .../lakeflow_pipelines_etl.pipeline.yml | 15 ++ .../lakeflow_pipelines_job.job.yml} | 14 +- .../sample_trips_my_lakeflow_pipelines.sql | 3 +- .../sample_zones_my_lakeflow_pipelines.sql | 0 .../my_lakeflow_pipelines.pipeline.yml | 12 -- .../databricks_template_schema.json | 93 +++++++++---- .../databricks_template_schema.vnext.json | 78 +++++++++++ .../lakeflow-pipelines/library/variables.tmpl | 33 ----- .../lakeflow-pipelines/library/versions.tmpl | 17 +++ ...ow-default-python-as-pipelines-template.md | 1 + .../template/__preamble.tmpl | 59 +++++++- .../{.gitignore.tmpl => .gitignore} | 2 + .../{{.project_name}}/.vscode/extensions.json | 1 - .../{{.project_name}}/.vscode/settings.json | 39 ++++++ .../.vscode/settings.json.tmpl | 22 --- .../template/{{.project_name}}/README.md.tmpl | 80 ++++++++--- .../{{.project_name}}/databricks.yml.tmpl | 33 +++-- .../{{.project_name}}/fixtures/.gitkeep.tmpl | 15 ++ .../template/{{.project_name}}/lib/.gitkeep | 1 + .../{{.project_name}}/lib/shared/__init__.py | 0 .../lib/shared/taxis.py.tmpl | 7 + .../{{.project_name}}/pyproject.toml.tmpl | 32 +++++ .../{{.project_name}}/resources/.gitkeep | 1 + .../sample_job/sample_job.job.yml.tmpl | 98 +++++++++++++ .../sample_job/sample_notebook.ipynb.tmpl | 83 +++++++++++ .../sample_job/sample_python_file.py.tmpl | 19 +++ .../README.md.tmpl | 22 +++ .../sample_trips_{{.project_name}}.py.tmpl | 6 +- .../sample_trips_{{.project_name}}.sql.tmpl | 5 +- .../sample_zones_{{.project_name}}.py.tmpl | 9 +- .../sample_zones_{{.project_name}}.sql.tmpl | 4 +- .../utilities/utils.py.tmpl | 12 ++ ...project_name_short}}_etl.pipeline.yml.tmpl | 39 ++++++ .../{{.project_name_short}}_job.job.yml.tmpl | 25 ++++ .../{{.project_name}}_pipeline/README.md.tmpl | 48 ------- .../sample_exploration.ipynb.tmpl | 130 ------------------ .../utilities/utils.py | 8 -- .../{{.project_name}}.pipeline.yml.tmpl | 12 -- .../{{.project_name}}/tests/conftest.py | 93 +++++++++++++ .../tests/sample_taxis_test.py.tmpl | 8 
++ 70 files changed, 1058 insertions(+), 480 deletions(-) create mode 100644 acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/lib/shared/__init__.py create mode 100644 acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/lib/shared/taxis.py create mode 100644 acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/.gitkeep rename acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/{my_lakeflow_pipelines_pipeline => lakeflow_pipelines_etl}/README.md (67%) rename acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/{my_lakeflow_pipelines_pipeline => lakeflow_pipelines_etl}/explorations/sample_exploration.ipynb (93%) create mode 100644 acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/lakeflow_pipelines_etl.pipeline.yml rename libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/{{.project_name}}.job.yml.tmpl => acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/lakeflow_pipelines_job.job.yml (52%) rename acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/{my_lakeflow_pipelines_pipeline => lakeflow_pipelines_etl}/transformations/sample_trips_my_lakeflow_pipelines.py (81%) rename acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/{my_lakeflow_pipelines_pipeline => lakeflow_pipelines_etl}/transformations/sample_zones_my_lakeflow_pipelines.py (57%) create mode 100644 acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/utilities/utils.py delete mode 100644 acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/my_lakeflow_pipelines.job.yml delete mode 100644 acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/my_lakeflow_pipelines.pipeline.yml delete mode 100644 acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/utilities/utils.py create mode 100644 acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/lib/shared/__init__.py create mode 100644 acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/lib/shared/taxis.py create mode 100644 acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/.gitkeep rename acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/{my_lakeflow_pipelines_pipeline => lakeflow_pipelines_etl}/README.md (57%) rename acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/{my_lakeflow_pipelines_pipeline => lakeflow_pipelines_etl}/explorations/sample_exploration.ipynb (75%) create mode 100644 acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/lakeflow_pipelines_etl.pipeline.yml rename acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/{my_lakeflow_pipelines_pipeline/my_lakeflow_pipelines.job.yml => lakeflow_pipelines_etl/lakeflow_pipelines_job.job.yml} (52%) rename 
acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/{my_lakeflow_pipelines_pipeline => lakeflow_pipelines_etl}/transformations/sample_trips_my_lakeflow_pipelines.sql (87%) rename acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/{my_lakeflow_pipelines_pipeline => lakeflow_pipelines_etl}/transformations/sample_zones_my_lakeflow_pipelines.sql (100%) delete mode 100644 acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/my_lakeflow_pipelines.pipeline.yml create mode 100644 libs/template/templates/lakeflow-pipelines/databricks_template_schema.vnext.json delete mode 100644 libs/template/templates/lakeflow-pipelines/library/variables.tmpl create mode 100644 libs/template/templates/lakeflow-pipelines/library/versions.tmpl create mode 100644 libs/template/templates/lakeflow-pipelines/pr-lakeflow-default-python-as-pipelines-template.md rename libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/{.gitignore.tmpl => .gitignore} (77%) create mode 100644 libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.vscode/settings.json delete mode 100644 libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.vscode/settings.json.tmpl create mode 100644 libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/fixtures/.gitkeep.tmpl create mode 100644 libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/lib/.gitkeep create mode 100644 libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/lib/shared/__init__.py create mode 100644 libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/lib/shared/taxis.py.tmpl create mode 100644 libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/pyproject.toml.tmpl create mode 100644 libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/.gitkeep create mode 100644 libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/sample_job/sample_job.job.yml.tmpl create mode 100644 libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/sample_job/sample_notebook.ipynb.tmpl create mode 100644 libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/sample_job/sample_python_file.py.tmpl create mode 100644 libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/README.md.tmpl rename libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{{.project_name}}_pipeline => {{.project_name_short}}_etl}/transformations/sample_trips_{{.project_name}}.py.tmpl (71%) rename libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{{.project_name}}_pipeline => {{.project_name_short}}_etl}/transformations/sample_trips_{{.project_name}}.sql.tmpl (66%) rename libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{{.project_name}}_pipeline => {{.project_name_short}}_etl}/transformations/sample_zones_{{.project_name}}.py.tmpl (57%) rename libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{{.project_name}}_pipeline => {{.project_name_short}}_etl}/transformations/sample_zones_{{.project_name}}.sql.tmpl (68%) create mode 100644 libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/utilities/utils.py.tmpl create mode 100644 
libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/{{.project_name_short}}_etl.pipeline.yml.tmpl create mode 100644 libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/{{.project_name_short}}_job.job.yml.tmpl delete mode 100644 libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/README.md.tmpl delete mode 100644 libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/explorations/sample_exploration.ipynb.tmpl delete mode 100644 libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/utilities/utils.py delete mode 100644 libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/{{.project_name}}.pipeline.yml.tmpl create mode 100644 libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/tests/conftest.py create mode 100644 libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/tests/sample_taxis_test.py.tmpl diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output.txt b/acceptance/bundle/templates/lakeflow-pipelines/python/output.txt index 954a2a8409..734e34eb60 100644 --- a/acceptance/bundle/templates/lakeflow-pipelines/python/output.txt +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output.txt @@ -1,12 +1,16 @@ >>> [CLI] bundle init lakeflow-pipelines --config-file ./input.json --output-dir output - Welcome to the template for Lakeflow Declarative Pipelines! +Please answer the below to tailor your project to your preferences. +You can always change your mind and change your configuration in the databricks.yml file later. + +Note that [DATABRICKS_URL] is used for initialization +(see https://docs.databricks.com/dev-tools/cli/profiles.html for how to change your profile). -Your new project has been created in the 'my_lakeflow_pipelines' directory! +✨ Your new project has been created in the 'my_lakeflow_pipelines' directory! -Refer to the README.md file for "getting started" instructions! +Please refer to the README.md file for "getting started" instructions. 
>>> [CLI] bundle validate -t dev Name: my_lakeflow_pipelines diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/.vscode/extensions.json b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/.vscode/extensions.json index 5d15eba363..1f39c33087 100644 --- a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/.vscode/extensions.json +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/.vscode/extensions.json @@ -1,7 +1,6 @@ { "recommendations": [ "databricks.databricks", - "ms-python.vscode-pylance", "redhat.vscode-yaml" ] } diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/.vscode/settings.json b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/.vscode/settings.json index 3e76d20bd8..d8468d7b60 100644 --- a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/.vscode/settings.json +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/.vscode/settings.json @@ -1,21 +1,39 @@ { - "python.analysis.stubPath": ".vscode", - "databricks.python.envFile": "${workspaceFolder}/.env", "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", "python.testing.pytestArgs": [ "." ], - "python.testing.unittestEnabled": false, - "python.testing.pytestEnabled": true, - "python.analysis.extraPaths": ["resources/my_lakeflow_pipelines_pipeline"], "files.exclude": { "**/*.egg-info": true, "**/__pycache__": true, ".pytest_cache": true, + "dist": true, }, + "files.associations": { + "**/.gitkeep": "markdown" + }, + + // Pylance settings (VS Code) + // Set typeCheckingMode to "basic" to enable type checking! + "python.analysis.typeCheckingMode": "off", + "python.analysis.extraPaths": ["src", "lib", "resources"], + "python.analysis.diagnosticMode": "workspace", + "python.analysis.stubPath": ".vscode", + + // Pyright settings (Cursor) + // Set typeCheckingMode to "basic" to enable type checking! + "cursorpyright.analysis.typeCheckingMode": "off", + "cursorpyright.analysis.extraPaths": ["src", "lib", "resources"], + "cursorpyright.analysis.diagnosticMode": "workspace", + "cursorpyright.analysis.stubPath": ".vscode", + + // General Python settings + "python.defaultInterpreterPath": "./.venv/bin/python", + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true, "[python]": { - "editor.defaultFormatter": "ms-python.black-formatter", + "editor.defaultFormatter": "ms-python.python", "editor.formatOnSave": true, }, } diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/README.md b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/README.md index 49d493b854..574b73b3b6 100644 --- a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/README.md +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/README.md @@ -2,38 +2,53 @@ The 'my_lakeflow_pipelines' project was generated by using the Lakeflow Pipelines template. -## Setup +* `lib/`: Python source code for this project. +* `lib/shared`: Shared source code across all jobs/pipelines/etc.
+* `resources/lakeflow_pipelines_etl`: Pipeline code and assets for the lakeflow_pipelines_etl pipeline. +* `resources/`: Resource configurations (jobs, pipelines, etc.) -1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html +## Getting started -2. Authenticate to your Databricks workspace, if you have not done so already: - ``` - $ databricks auth login - ``` +Choose how you want to work on this project: + +(a) Directly in your Databricks workspace, see + https://docs.databricks.com/dev-tools/bundles/workspace. -3. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from - https://docs.databricks.com/dev-tools/vscode-ext.html. Or the PyCharm plugin from - https://www.databricks.com/blog/announcing-pycharm-integration-databricks. +(b) Locally with an IDE like Cursor or VS Code, see + https://docs.databricks.com/vscode-ext. +(c) With command line tools, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html -## Deploying resources +# Using this project with the CLI -1. To deploy a development copy of this project, type: +The Databricks workspace and IDE extensions provide a graphical interface for working +with this project. It's also possible to interact with it directly using the CLI: + +1. Authenticate to your Databricks workspace, if you have not done so already: + ``` + $ databricks configure + ``` + +2. To deploy a development copy of this project, type: ``` $ databricks bundle deploy --target dev ``` (Note that "dev" is the default target, so the `--target` parameter is optional here.) -2. Similarly, to deploy a production copy, type: - ``` - $ databricks bundle deploy --target prod - ``` + This deploys everything that's defined for this project. + For example, the default template would deploy a pipeline called + `[dev yourname] lakeflow_pipelines_etl` to your workspace. + You can find that resource by opening your workspace and clicking on **Jobs & Pipelines**. -3. Use the "summary" comand to review everything that was deployed: +3. Similarly, to deploy a production copy, type: ``` - $ databricks bundle summary + $ databricks bundle deploy --target prod ``` + Note that the default template includes a job that runs the pipeline every day + (defined in resources/lakeflow_pipelines_etl/lakeflow_pipelines_job.job.yml). The schedule + is paused when deploying in development mode (see + https://docs.databricks.com/dev-tools/bundles/deployment-modes.html). 4. To run a job or pipeline, use the "run" command: ``` diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/databricks.yml b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/databricks.yml index ded4a8470d..e370dcb1fe 100644 --- a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/databricks.yml +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/databricks.yml @@ -14,8 +14,6 @@ variables: description: The catalog to use schema: description: The schema to use - notifications: - description: The email addresses to use for failure notifications targets: dev: @@ -30,18 +28,15 @@ targets: variables: catalog: main schema: ${workspace.current_user.short_name} - notifications: [] - prod: mode: production workspace: host: [DATABRICKS_URL] # We explicitly deploy to /Workspace/Users/[USERNAME] to make sure we only have a single copy.
root_path: /Workspace/Users/[USERNAME]/.bundle/${bundle.name}/${bundle.target} + variables: + catalog: main + schema: prod permissions: - user_name: [USERNAME] level: CAN_MANAGE - variables: - catalog: main - schema: default - notifications: [[USERNAME]] diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/lib/shared/__init__.py b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/lib/shared/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/lib/shared/taxis.py b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/lib/shared/taxis.py new file mode 100644 index 0000000000..a7309cd4c5 --- /dev/null +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/lib/shared/taxis.py @@ -0,0 +1,7 @@ +from databricks.sdk.runtime import spark +from pyspark.sql import DataFrame + + +def find_all_taxis() -> DataFrame: + """Find all taxi data.""" + return spark.read.table("samples.nyctaxi.trips") diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/out.gitignore b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/out.gitignore index f6a3b5ff93..e566c51f74 100644 --- a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/out.gitignore +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/out.gitignore @@ -4,5 +4,7 @@ dist/ __pycache__/ *.egg-info .venv/ +scratch/** +!scratch/README.md **/explorations/** **/!explorations/README.md diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/.gitkeep b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/.gitkeep new file mode 100644 index 0000000000..3e09c14c18 --- /dev/null +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/.gitkeep @@ -0,0 +1 @@ +This folder is reserved for Databricks Asset Bundles resource definitions. diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/README.md b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/README.md similarity index 67% rename from acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/README.md rename to acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/README.md index 6caf95d48a..56f7aa2593 100644 --- a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/README.md +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/README.md @@ -1,11 +1,11 @@ -# my_lakeflow_pipelines_pipeline +# my_lakeflow_pipelines -This folder defines all source code for the my_lakeflow_pipelines_pipeline pipeline: +This folder defines all source code for the my_lakeflow_pipelines pipeline: -- `explorations`: Ad-hoc notebooks used to explore the data processed by this pipeline. -- `transformations`: All dataset definitions and transformations. 
-- `utilities` (optional): Utility functions and Python modules used in this pipeline. -- `data_sources` (optional): View definitions describing the source data for this pipeline. +- `explorations/`: Ad-hoc notebooks used to explore the data processed by this pipeline. +- `transformations/`: All dataset definitions and transformations. +- `utilities/` (optional): Utility functions and Python modules used in this pipeline. +- `data_sources/` (optional): View definitions describing the source data for this pipeline. ## Getting Started diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/explorations/sample_exploration.ipynb b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/explorations/sample_exploration.ipynb similarity index 93% rename from acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/explorations/sample_exploration.ipynb rename to acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/explorations/sample_exploration.ipynb index 0187c0c95f..31eabaf091 100644 --- a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/explorations/sample_exploration.ipynb +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/explorations/sample_exploration.ipynb @@ -37,7 +37,7 @@ "source": [ "# !!! Before performing any data analysis, make sure to run the pipeline to materialize the sample datasets. The tables referenced in this notebook depend on that step.\n", "\n", - "display(spark.sql(\"SELECT * FROM main.[USERNAME].my_lakeflow_pipelines\"))" + "display(spark.sql(\"SELECT * FROM main.[USERNAME].sample_trips_my_lakeflow_pipelines\"))" ] } ], diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/lakeflow_pipelines_etl.pipeline.yml b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/lakeflow_pipelines_etl.pipeline.yml new file mode 100644 index 0000000000..90779ed09c --- /dev/null +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/lakeflow_pipelines_etl.pipeline.yml @@ -0,0 +1,15 @@ +# The main pipeline for my_lakeflow_pipelines + +resources: + pipelines: + lakeflow_pipelines_etl: + name: lakeflow_pipelines_etl + ## Catalog is required for serverless compute + catalog: main + schema: ${var.schema} + serverless: true + root_path: "." 
+ + libraries: + - glob: + include: transformations/** diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/{{.project_name}}.job.yml.tmpl b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/lakeflow_pipelines_job.job.yml similarity index 52% rename from libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/{{.project_name}}.job.yml.tmpl rename to acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/lakeflow_pipelines_job.job.yml index 1e7a7ca780..b1a9d39aad 100644 --- a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/{{.project_name}}.job.yml.tmpl +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/lakeflow_pipelines_job.job.yml @@ -1,8 +1,9 @@ -# The job that triggers {{template `pipeline_name` .}}. +# The job that triggers lakeflow_pipelines_etl. + resources: jobs: - {{template `job_name` .}}: - name: {{template `job_name` .}} + lakeflow_pipelines_job: + name: lakeflow_pipelines_job trigger: # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger @@ -10,10 +11,11 @@ resources: interval: 1 unit: DAYS - email_notifications: - on_failure: ${var.notifications} + #email_notifications: + # on_failure: + # - your_email@example.com tasks: - task_key: refresh_pipeline pipeline_task: - pipeline_id: ${resources.pipelines.{{template `pipeline_name` .}}.id} + pipeline_id: ${resources.pipelines.lakeflow_pipelines_etl.id} diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/transformations/sample_trips_my_lakeflow_pipelines.py b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/transformations/sample_trips_my_lakeflow_pipelines.py similarity index 81% rename from acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/transformations/sample_trips_my_lakeflow_pipelines.py rename to acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/transformations/sample_trips_my_lakeflow_pipelines.py index f8355f62ae..6616f607c5 100644 --- a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/transformations/sample_trips_my_lakeflow_pipelines.py +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/transformations/sample_trips_my_lakeflow_pipelines.py @@ -10,4 +10,6 @@ @dlt.table def sample_trips_my_lakeflow_pipelines(): - return spark.read.table("samples.nyctaxi.trips").withColumn("trip_distance_km", utils.distance_km(col("trip_distance"))) + return spark.read.table("samples.nyctaxi.trips").withColumn( + "trip_distance_km", utils.distance_km(col("trip_distance")) + ) diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/transformations/sample_zones_my_lakeflow_pipelines.py 
b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/transformations/sample_zones_my_lakeflow_pipelines.py similarity index 57% rename from acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/transformations/sample_zones_my_lakeflow_pipelines.py rename to acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/transformations/sample_zones_my_lakeflow_pipelines.py index c9bee817b6..4ee08b5cd7 100644 --- a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/transformations/sample_zones_my_lakeflow_pipelines.py +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/transformations/sample_zones_my_lakeflow_pipelines.py @@ -8,6 +8,11 @@ @dlt.table -def sample_zones_my_lakeflow_pipelines(): +def sample_zones_my_lakeflow_pipelines(): # Read from the "sample_trips" table, then sum all the fares - return spark.read.table("sample_trips_my_lakeflow_pipelines").groupBy(col("pickup_zip")).agg(sum("fare_amount").alias("total_fare")) + return ( + spark.read.table("sample_trips_my_lakeflow_pipelines") + .groupBy(col("pickup_zip")) + .agg(sum("fare_amount").alias("total_fare")) + ) + diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/utilities/utils.py b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/utilities/utils.py new file mode 100644 index 0000000000..f0f4e940f7 --- /dev/null +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/utilities/utils.py @@ -0,0 +1,12 @@ +from pyspark.sql.functions import col, when + + +def distance_km(distance_col): + """Convert distance from miles to kilometers.""" + return distance_col * 1.60934 + + +def format_currency(amount_col): + """Format amount as currency.""" + return when(col(amount_col).isNotNull(), + col(amount_col).cast("decimal(10,2)")) diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/my_lakeflow_pipelines.job.yml b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/my_lakeflow_pipelines.job.yml deleted file mode 100644 index f07a973780..0000000000 --- a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/my_lakeflow_pipelines.job.yml +++ /dev/null @@ -1,19 +0,0 @@ -# The job that triggers my_lakeflow_pipelines_pipeline.
-resources: - jobs: - my_lakeflow_pipelines_job: - name: my_lakeflow_pipelines_job - - trigger: - # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger - periodic: - interval: 1 - unit: DAYS - - email_notifications: - on_failure: ${var.notifications} - - tasks: - - task_key: refresh_pipeline - pipeline_task: - pipeline_id: ${resources.pipelines.my_lakeflow_pipelines_pipeline.id} diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/my_lakeflow_pipelines.pipeline.yml b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/my_lakeflow_pipelines.pipeline.yml deleted file mode 100644 index 499ddad0ca..0000000000 --- a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/my_lakeflow_pipelines.pipeline.yml +++ /dev/null @@ -1,12 +0,0 @@ -resources: - pipelines: - my_lakeflow_pipelines_pipeline: - name: my_lakeflow_pipelines_pipeline - serverless: true - channel: "PREVIEW" - catalog: ${var.catalog} - schema: ${var.schema} - root_path: "." - libraries: - - glob: - include: transformations/** diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/utilities/utils.py b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/utilities/utils.py deleted file mode 100644 index ff039898f0..0000000000 --- a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/utilities/utils.py +++ /dev/null @@ -1,8 +0,0 @@ -from pyspark.sql.functions import udf -from pyspark.sql.types import FloatType - - -@udf(returnType=FloatType()) -def distance_km(distance_miles): - """Convert distance from miles to kilometers (1 mile = 1.60934 km).""" - return distance_miles * 1.60934 diff --git a/acceptance/bundle/templates/lakeflow-pipelines/sql/output.txt b/acceptance/bundle/templates/lakeflow-pipelines/sql/output.txt index 954a2a8409..734e34eb60 100644 --- a/acceptance/bundle/templates/lakeflow-pipelines/sql/output.txt +++ b/acceptance/bundle/templates/lakeflow-pipelines/sql/output.txt @@ -1,12 +1,16 @@ >>> [CLI] bundle init lakeflow-pipelines --config-file ./input.json --output-dir output - Welcome to the template for Lakeflow Declarative Pipelines! +Please answer the below to tailor your project to your preferences. +You can always change your mind and change your configuration in the databricks.yml file later. + +Note that [DATABRICKS_URL] is used for initialization +(see https://docs.databricks.com/dev-tools/cli/profiles.html for how to change your profile). -Your new project has been created in the 'my_lakeflow_pipelines' directory! +✨ Your new project has been created in the 'my_lakeflow_pipelines' directory! -Refer to the README.md file for "getting started" instructions! +Please refer to the README.md file for "getting started" instructions. 
>>> [CLI] bundle validate -t dev Name: my_lakeflow_pipelines diff --git a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/.vscode/extensions.json b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/.vscode/extensions.json index 5d15eba363..1f39c33087 100644 --- a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/.vscode/extensions.json +++ b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/.vscode/extensions.json @@ -1,7 +1,6 @@ { "recommendations": [ "databricks.databricks", - "ms-python.vscode-pylance", "redhat.vscode-yaml" ] } diff --git a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/.vscode/settings.json b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/.vscode/settings.json index 3e76d20bd8..d8468d7b60 100644 --- a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/.vscode/settings.json +++ b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/.vscode/settings.json @@ -1,21 +1,39 @@ { - "python.analysis.stubPath": ".vscode", - "databricks.python.envFile": "${workspaceFolder}/.env", "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", "python.testing.pytestArgs": [ "." ], - "python.testing.unittestEnabled": false, - "python.testing.pytestEnabled": true, - "python.analysis.extraPaths": ["resources/my_lakeflow_pipelines_pipeline"], "files.exclude": { "**/*.egg-info": true, "**/__pycache__": true, ".pytest_cache": true, + "dist": true, }, + "files.associations": { + "**/.gitkeep": "markdown" + }, + + // Pylance settings (VS Code) + // Set typeCheckingMode to "basic" to enable type checking! + "python.analysis.typeCheckingMode": "off", + "python.analysis.extraPaths": ["src", "lib", "resources"], + "python.analysis.diagnosticMode": "workspace", + "python.analysis.stubPath": ".vscode", + + // Pyright settings (Cursor) + // Set typeCheckingMode to "basic" to enable type checking! + "cursorpyright.analysis.typeCheckingMode": "off", + "cursorpyright.analysis.extraPaths": ["src", "lib", "resources"], + "cursorpyright.analysis.diagnosticMode": "workspace", + "cursorpyright.analysis.stubPath": ".vscode", + + // General Python settings + "python.defaultInterpreterPath": "./.venv/bin/python", + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true, "[python]": { - "editor.defaultFormatter": "ms-python.black-formatter", + "editor.defaultFormatter": "ms-python.python", "editor.formatOnSave": true, }, } diff --git a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/README.md b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/README.md index 49d493b854..574b73b3b6 100644 --- a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/README.md +++ b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/README.md @@ -2,38 +2,53 @@ The 'my_lakeflow_pipelines' project was generated by using the Lakeflow Pipelines template. -## Setup +* `lib/`: Python source code for this project. +* `lib/shared`: Shared source code across all jobs/pipelines/etc. +* `resources/lakeflow_pipelines_etl`: Pipeline code and assets for the lakeflow_pipelines_etl pipeline.
+* `resources/`: Resource configurations (jobs, pipelines, etc.) -1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html +## Getting started -2. Authenticate to your Databricks workspace, if you have not done so already: - ``` - $ databricks auth login - ``` +Choose how you want to work on this project: + +(a) Directly in your Databricks workspace, see + https://docs.databricks.com/dev-tools/bundles/workspace. -3. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from - https://docs.databricks.com/dev-tools/vscode-ext.html. Or the PyCharm plugin from - https://www.databricks.com/blog/announcing-pycharm-integration-databricks. +(b) Locally with an IDE like Cursor or VS Code, see + https://docs.databricks.com/vscode-ext. +(c) With command line tools, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html -## Deploying resources +# Using this project with the CLI -1. To deploy a development copy of this project, type: +The Databricks workspace and IDE extensions provide a graphical interface for working +with this project. It's also possible to interact with it directly using the CLI: + +1. Authenticate to your Databricks workspace, if you have not done so already: + ``` + $ databricks configure + ``` + +2. To deploy a development copy of this project, type: ``` $ databricks bundle deploy --target dev ``` (Note that "dev" is the default target, so the `--target` parameter is optional here.) -2. Similarly, to deploy a production copy, type: - ``` - $ databricks bundle deploy --target prod - ``` + This deploys everything that's defined for this project. + For example, the default template would deploy a pipeline called + `[dev yourname] lakeflow_pipelines_etl` to your workspace. + You can find that resource by opening your workspace and clicking on **Jobs & Pipelines**. -3. Use the "summary" comand to review everything that was deployed: +3. Similarly, to deploy a production copy, type: ``` - $ databricks bundle summary + $ databricks bundle deploy --target prod ``` + Note that the default template includes a job that runs the pipeline every day + (defined in resources/lakeflow_pipelines_etl/lakeflow_pipelines_job.job.yml). The schedule + is paused when deploying in development mode (see + https://docs.databricks.com/dev-tools/bundles/deployment-modes.html). 4. To run a job or pipeline, use the "run" command: ``` diff --git a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/databricks.yml b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/databricks.yml index ded4a8470d..e370dcb1fe 100644 --- a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/databricks.yml +++ b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/databricks.yml @@ -14,8 +14,6 @@ variables: description: The catalog to use schema: description: The schema to use - notifications: - description: The email addresses to use for failure notifications targets: dev: @@ -30,18 +28,15 @@ targets: variables: catalog: main schema: ${workspace.current_user.short_name} - notifications: [] - prod: mode: production workspace: host: [DATABRICKS_URL] # We explicitly deploy to /Workspace/Users/[USERNAME] to make sure we only have a single copy.
root_path: /Workspace/Users/[USERNAME]/.bundle/${bundle.name}/${bundle.target} + variables: + catalog: main + schema: prod permissions: - user_name: [USERNAME] level: CAN_MANAGE - variables: - catalog: main - schema: default - notifications: [[USERNAME]] diff --git a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/lib/shared/__init__.py b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/lib/shared/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/lib/shared/taxis.py b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/lib/shared/taxis.py new file mode 100644 index 0000000000..a7309cd4c5 --- /dev/null +++ b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/lib/shared/taxis.py @@ -0,0 +1,7 @@ +from databricks.sdk.runtime import spark +from pyspark.sql import DataFrame + + +def find_all_taxis() -> DataFrame: + """Find all taxi data.""" + return spark.read.table("samples.nyctaxi.trips") diff --git a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/out.gitignore b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/out.gitignore index f6a3b5ff93..e566c51f74 100644 --- a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/out.gitignore +++ b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/out.gitignore @@ -4,5 +4,7 @@ dist/ __pycache__/ *.egg-info .venv/ +scratch/** +!scratch/README.md **/explorations/** **/!explorations/README.md diff --git a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/.gitkeep b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/.gitkeep new file mode 100644 index 0000000000..3e09c14c18 --- /dev/null +++ b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/.gitkeep @@ -0,0 +1 @@ +This folder is reserved for Databricks Asset Bundles resource definitions. diff --git a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/README.md b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/README.md similarity index 57% rename from acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/README.md rename to acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/README.md index d77802d23e..56f7aa2593 100644 --- a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/README.md +++ b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/README.md @@ -1,18 +1,19 @@ -# my_lakeflow_pipelines_pipeline +# my_lakeflow_pipelines -This folder defines all source code for the 'my_lakeflow_pipelines_pipeline' pipeline: +This folder defines all source code for the my_lakeflow_pipelines pipeline: -- `explorations`: Ad-hoc notebooks used to explore the data processed by this pipeline. -- `transformations`: All dataset definitions and transformations. -- `data_sources` (optional): View definitions describing the source data for this pipeline. 
+- `explorations/`: Ad-hoc notebooks used to explore the data processed by this pipeline. +- `transformations/`: All dataset definitions and transformations. +- `utilities/` (optional): Utility functions and Python modules used in this pipeline. +- `data_sources/` (optional): View definitions describing the source data for this pipeline. ## Getting Started To get started, go to the `transformations` folder -- most of the relevant source code lives there: * By convention, every dataset under `transformations` is in a separate file. -* Take a look at the sample under "sample_trips_my_lakeflow_pipelines.sql" to get familiar with the syntax. - Read more about the syntax at https://docs.databricks.com/dlt/sql-ref.html. +* Take a look at the sample under "sample_trips_my_lakeflow_pipelines.py" to get familiar with the syntax. + Read more about the syntax at https://docs.databricks.com/dlt/python-ref.html. * Use `Run file` to run and preview a single transformation. * Use `Run pipeline` to run _all_ transformations in the entire pipeline. * Use `+ Add` in the file browser to add a new data set definition. diff --git a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/explorations/sample_exploration.ipynb b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/explorations/sample_exploration.ipynb similarity index 75% rename from acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/explorations/sample_exploration.ipynb rename to acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/explorations/sample_exploration.ipynb index a3db8fdf08..31eabaf091 100644 --- a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/explorations/sample_exploration.ipynb +++ b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/explorations/sample_exploration.ipynb @@ -35,12 +35,9 @@ }, "outputs": [], "source": [ - "-- !!! Before performing any data analysis, make sure to run the pipeline to materialize the sample datasets. The tables referenced in this notebook depend on that step.\n", + "# !!! Before performing any data analysis, make sure to run the pipeline to materialize the sample datasets. 
The tables referenced in this notebook depend on that step.\n", "\n", - "USE CATALOG `main`;\n", - "USE SCHEMA `[USERNAME]`;\n", - "\n", - "SELECT * from my_lakeflow_pipelines;" + "display(spark.sql(\"SELECT * FROM main.[USERNAME].sample_trips_my_lakeflow_pipelines\"))" ] } ], @@ -50,13 +47,15 @@ "dashboards": [], "environmentMetadata": null, "inputWidgetPreferences": null, - "language": "sql", - "notebookMetadata": {}, + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, "notebookName": "sample_exploration", "widgets": {} }, "language_info": { - "name": "sql" + "name": "python" } }, "nbformat": 4, diff --git a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/lakeflow_pipelines_etl.pipeline.yml b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/lakeflow_pipelines_etl.pipeline.yml new file mode 100644 index 0000000000..90779ed09c --- /dev/null +++ b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/lakeflow_pipelines_etl.pipeline.yml @@ -0,0 +1,15 @@ +# The main pipeline for my_lakeflow_pipelines + +resources: + pipelines: + lakeflow_pipelines_etl: + name: lakeflow_pipelines_etl + ## Catalog is required for serverless compute + catalog: main + schema: ${var.schema} + serverless: true + root_path: "." + + libraries: + - glob: + include: transformations/** diff --git a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/my_lakeflow_pipelines.job.yml b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/lakeflow_pipelines_job.job.yml similarity index 52% rename from acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/my_lakeflow_pipelines.job.yml rename to acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/lakeflow_pipelines_job.job.yml index f07a973780..b1a9d39aad 100644 --- a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/my_lakeflow_pipelines.job.yml +++ b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/lakeflow_pipelines_job.job.yml @@ -1,8 +1,9 @@ -# The job that triggers my_lakeflow_pipelines_pipeline. +# The job that triggers lakeflow_pipelines_etl. 
+ resources: jobs: - my_lakeflow_pipelines_job: - name: my_lakeflow_pipelines_job + lakeflow_pipelines_job: + name: lakeflow_pipelines_job trigger: # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger @@ -10,10 +11,11 @@ resources: interval: 1 unit: DAYS - email_notifications: - on_failure: ${var.notifications} + #email_notifications: + # on_failure: + # - your_email@example.com tasks: - task_key: refresh_pipeline pipeline_task: - pipeline_id: ${resources.pipelines.my_lakeflow_pipelines_pipeline.id} + pipeline_id: ${resources.pipelines.lakeflow_pipelines_etl.id} diff --git a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/transformations/sample_trips_my_lakeflow_pipelines.sql b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/transformations/sample_trips_my_lakeflow_pipelines.sql similarity index 87% rename from acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/transformations/sample_trips_my_lakeflow_pipelines.sql rename to acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/transformations/sample_trips_my_lakeflow_pipelines.sql index 116bb5184b..064db57f8b 100644 --- a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/transformations/sample_trips_my_lakeflow_pipelines.sql +++ b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/transformations/sample_trips_my_lakeflow_pipelines.sql @@ -5,5 +5,6 @@ CREATE MATERIALIZED VIEW sample_trips_my_lakeflow_pipelines AS SELECT pickup_zip, - fare_amount + fare_amount, + trip_distance FROM samples.nyctaxi.trips diff --git a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/transformations/sample_zones_my_lakeflow_pipelines.sql b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/transformations/sample_zones_my_lakeflow_pipelines.sql similarity index 100% rename from acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/transformations/sample_zones_my_lakeflow_pipelines.sql rename to acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/transformations/sample_zones_my_lakeflow_pipelines.sql diff --git a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/my_lakeflow_pipelines.pipeline.yml b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/my_lakeflow_pipelines.pipeline.yml deleted file mode 100644 index 499ddad0ca..0000000000 --- a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/my_lakeflow_pipelines_pipeline/my_lakeflow_pipelines.pipeline.yml +++ /dev/null @@ -1,12 +0,0 @@ -resources: - pipelines: - my_lakeflow_pipelines_pipeline: - name: my_lakeflow_pipelines_pipeline - serverless: true - channel: "PREVIEW" - catalog: ${var.catalog} - schema: ${var.schema} - root_path: "." 
- libraries: - - glob: - include: transformations/** diff --git a/libs/template/templates/lakeflow-pipelines/databricks_template_schema.json b/libs/template/templates/lakeflow-pipelines/databricks_template_schema.json index 8fbc13c69f..f881f1743f 100644 --- a/libs/template/templates/lakeflow-pipelines/databricks_template_schema.json +++ b/libs/template/templates/lakeflow-pipelines/databricks_template_schema.json @@ -1,57 +1,96 @@ { - "welcome_message": "\nWelcome to the template for Lakeflow Declarative Pipelines!", + "//": "This template is based on an upcoming version of the default-python template, but comes with a SQL variation + certain defaults that are specific to Lakeflow Pipelines.", + "welcome_message": "Welcome to the template for Lakeflow Declarative Pipelines!\n\nPlease answer the below to tailor your project to your preferences.\nYou can always change your mind and change your configuration in the databricks.yml file later.\n\nNote that {{workspace_host}} is used for initialization\n(see https://docs.databricks.com/dev-tools/cli/profiles.html for how to change your profile).", "properties": { + "lakeflow_only": { + "//": "This property is set by default adjust this template for Lakeflow Pipelines", + "skip_prompt_if": {}, + "default": "yes", + "type": "string", + "enum": ["yes", "no"], + "description": "Adjust template for Lakeflow Pipelines", + "order": 0 + }, "project_name": { "type": "string", - "default": "my_lakeflow_project", - "description": "Please provide the following details to tailor the template to your preferences.\n\nUnique name for this project\nproject_name", + "default": "lakeflow_project", + "description": "\nUnique name for this project", "order": 1, "pattern": "^[a-z0-9_]+$", - "pattern_match_failure_message": "Name must consist of lower case letters, numbers, and underscores." + "pattern_match_failure_message": "Name must consist of letters, numbers, and underscores." + }, + "project_name_short": { + "//": "This is a derived property based on project_name (it replaces my_project with sample and strips _project|_app|_service)", + "skip_prompt_if": {}, + "type": "string", + "default": "{{if or (eq .project_name \"lakeflow_project\") (eq .project_name \"my_project\")}}sample{{else}}{{with (regexp \"^(my_|lakeflow_)?(.*)(_project|_app|_service)?$\").FindStringSubmatch .project_name}}{{index . 
2}}{{else}}{{.project_name}}{{end}}{{end}}", + "description": "Short name for the project", + "order": 2 + }, + "include_job": { + "//": "For the present template, the answer here is always 'no'", + "skip_prompt_if": {}, + "default": "no", + "type": "string", + "enum": ["yes", "no"], + "description": "Include a Lakeflow job that runs a notebook", + "order": 3 + }, + "include_pipeline": { + "//": "For the present template, the answer here is always 'yes'", + "skip_prompt_if": {}, + "default": "yes", + "type": "string", + "enum": ["yes", "no"], + "description": "Include a Lakeflow ETL pipeline", + "order": 4 + }, + "include_python": { + "//": "For the present template, the answer here is always 'no'", + "skip_prompt_if": {}, + "type": "string", + "default": "no", + "enum": ["yes", "no"], + "description": "Include a sample Python package that is built to a wheel file", + "order": 5 + }, + "serverless": { + "//": "For the present template, the answer here is always 'yes', since it can be easily changed", + "skip_prompt_if": {}, + "type": "string", + "default": "yes", + "enum": ["yes", "no"], + "description": "Use serverless compute?", + "order": 6 }, "default_catalog": { "type": "string", "default": "{{default_catalog}}", "pattern": "^\\w*$", "pattern_match_failure_message": "Invalid catalog name.", - "description": "\nInitial catalog:\ndefault_catalog", - "order": 3 + "description": "Default catalog for any tables created by this project{{if eq (default_catalog) \"\"}} (leave blank when not using Unity Catalog){{end}}", + "order": 7 }, "personal_schemas": { "type": "string", - "description": "\nUse a personal schema for each user working on this project? (e.g., 'catalog.{{short_name}}')\npersonal_schemas", + "description": "Use a personal schema for each user working on this project\n(this is recommended, your personal schema will be '{{.default_catalog}}.{{short_name}}')", "default": "yes", "enum": [ "yes", - "no" + "no (advanced: I will customize the schema configuration later in databricks.yml)" ], - "order": 4 - }, - "shared_schema": { - "skip_prompt_if": { - "properties": { - "personal_schemas": { - "const": "yes" - } - } - }, - "type": "string", - "default": "default", - "pattern": "^\\w+$", - "pattern_match_failure_message": "Invalid schema name.", - "description": "\nInitial schema during development:\ndefault_schema", - "order": 5 + "order": 8 }, "language": { "type": "string", "default": "python", - "description": "\nInitial language for this project:\nlanguage", + "description": "Initial language for this project", "enum": [ "python", "sql" ], - "order": 6 + "order": 9 } }, - "success_message": "\n\nYour new project has been created in the '{{.project_name}}' directory!\n\nRefer to the README.md file for \"getting started\" instructions!" + "success_message": "\n✨ Your new project has been created in the '{{.project_name}}' directory!\n\nPlease refer to the README.md file for \"getting started\" instructions." 
} diff --git a/libs/template/templates/lakeflow-pipelines/databricks_template_schema.vnext.json b/libs/template/templates/lakeflow-pipelines/databricks_template_schema.vnext.json new file mode 100644 index 0000000000..2afdff62b4 --- /dev/null +++ b/libs/template/templates/lakeflow-pipelines/databricks_template_schema.vnext.json @@ -0,0 +1,78 @@ +{ + "welcome_message": "Welcome to the default Python template for Databricks Asset Bundles!\n\nPlease answer the below to tailor your project to your preferences.\nYou can always change your mind and change your configuration in the databricks.yml file later.\n\nNote that {{workspace_host}} is used for initialization\n(see https://docs.databricks.com/dev-tools/cli/profiles.html for how to change your profile).", + "properties": { + "project_name": { + "type": "string", + "default": "my_project", + "description": "\nUnique name for this project", + "order": 1, + "pattern": "^[A-Za-z0-9_]+$", + "pattern_match_failure_message": "Name must consist of letters, numbers, and underscores." + }, + "project_name_short": { + "//": "This is a derived property based on project_name (it replaces my_project with sample and strips _project|_app|_service)", + "skip_prompt_if": {}, + "type": "string", + "default": "{{if eq .project_name \"my_project\"}}sample{{else}}{{with (regexp \"^(my_)?(.*)(_project|_app|_service)?$\").FindStringSubmatch .project_name}}{{index . 2}}{{else}}{{.project_name}}{{end}}{{end}}", + "description": "Short name for the project", + "order": 2 + }, + "include_job": { + "type": "string", + "default": "yes", + "enum": ["yes", "no"], + "description": "Include a Lakeflow job that runs a notebook", + "order": 3 + }, + "include_pipeline": { + "type": "string", + "default": "yes", + "enum": ["yes", "no"], + "description": "Include a Lakeflow ETL pipeline", + "order": 4 + }, + "include_python": { + "type": "string", + "default": "yes", + "enum": ["yes", "no"], + "description": "Include a sample Python package that is built to a wheel file", + "order": 5 + }, + "serverless": { + "type": "string", + "default": "yes", + "enum": ["yes", "no"], + "description": "Use serverless compute", + "order": 6 + }, + "default_catalog": { + "type": "string", + "default": "{{default_catalog}}", + "pattern": "^\\w*$", + "pattern_match_failure_message": "Invalid catalog name.", + "description": "Default catalog for any tables created by this project{{if eq (default_catalog) \"\"}} (leave blank when not using Unity Catalog){{end}}", + "order": 7 + }, + "personal_schemas": { + "type": "string", + "description": "Use a personal schema for each user working on this project\n(this is recommended, your personal schema will be '{{.default_catalog}}.{{short_name}}')", + "default": "yes", + "enum": [ + "yes", + "no (advanced: I will customize the schema configuration later in databricks.yml)" + ], + "order": 8 + }, + "language": { + "type": "string", + "default": "python", + "description": "Initial language for this project", + "enum": [ + "python", + "sql" + ], + "order": 9 + } + }, + "success_message": "\n✨ Your new project has been created in the '{{.project_name}}' directory!\n\nPlease refer to the README.md file for \"getting started\" instructions.\nSee also the documentation at https://docs.databricks.com/dev-tools/bundles/index.html." 
+} diff --git a/libs/template/templates/lakeflow-pipelines/library/variables.tmpl b/libs/template/templates/lakeflow-pipelines/library/variables.tmpl deleted file mode 100644 index 9c5c36b449..0000000000 --- a/libs/template/templates/lakeflow-pipelines/library/variables.tmpl +++ /dev/null @@ -1,33 +0,0 @@ -{{- define `pipeline_name` -}} - {{ .project_name }}_pipeline -{{- end }} - -{{- define `job_name` -}} - {{ .project_name }}_job -{{- end }} - -{{- define `static_dev_schema` -}} - {{- if (regexp "^yes").MatchString .personal_schemas -}} - {{ short_name }} - {{- else -}} - {{ .shared_schema }} - {{- end}} -{{- end }} - - -{{- define `dev_schema` -}} - {{- if (regexp "^yes").MatchString .personal_schemas -}} - ${workspace.current_user.short_name} - {{- else -}} - {{ .shared_schema }} - {{- end}} -{{- end }} - - -{{- define `prod_schema` -}} - {{- if (regexp "^yes").MatchString .personal_schemas -}} - default - {{- else -}} - {{ .shared_schema }} - {{- end}} -{{- end }} diff --git a/libs/template/templates/lakeflow-pipelines/library/versions.tmpl b/libs/template/templates/lakeflow-pipelines/library/versions.tmpl new file mode 100644 index 0000000000..912dc9c9d7 --- /dev/null +++ b/libs/template/templates/lakeflow-pipelines/library/versions.tmpl @@ -0,0 +1,17 @@ +{{/* The latest LTS DBR version; this should be updated a few months after each LTS. + */}} +{{define "latest_lts_dbr_version" -}} + 16.4.x-scala2.12 +{{- end}} + +{{/* A safe version of DB Connect that is compatible with at least half the + * clusters running in production. + * + * We need to be very conservative in updating this, since a newer version can + * only connect to compute of that same version and higher. If the version is + * deemed too old, customers can update the version themselves after initializing + * the template. + */}} +{{define "conservative_db_connect_version_spec" -}} + >=15.4,<15.5 +{{- end}} diff --git a/libs/template/templates/lakeflow-pipelines/pr-lakeflow-default-python-as-pipelines-template.md b/libs/template/templates/lakeflow-pipelines/pr-lakeflow-default-python-as-pipelines-template.md new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/libs/template/templates/lakeflow-pipelines/pr-lakeflow-default-python-as-pipelines-template.md @@ -0,0 +1 @@ + diff --git a/libs/template/templates/lakeflow-pipelines/template/__preamble.tmpl b/libs/template/templates/lakeflow-pipelines/template/__preamble.tmpl index c6c0c2321f..6e456ed695 100644 --- a/libs/template/templates/lakeflow-pipelines/template/__preamble.tmpl +++ b/libs/template/templates/lakeflow-pipelines/template/__preamble.tmpl @@ -1,16 +1,61 @@ # Preamble -This file only contains template directives; it is skipped for the actual output. +This file only template directives; it is skipped for the actual output. 
{{skip "__preamble"}} +{{$pipeline := eq .include_pipeline "yes"}} +{{$job := eq .include_job "yes"}} +{{$python_package := eq .include_python "yes"}} {{$isSQL := eq .language "sql"}} +{{$lakeflow_only := eq .lakeflow_only "yes"}} -{{if $isSQL}} - {{skip "{{.project_name}}/resources/{{.project_name}}_pipeline/utilities/utils.py"}} - {{skip "{{.project_name}}/resources/{{.project_name}}_pipeline/transformations/sample_zones_{{.project_name}}.py"}} - {{skip "{{.project_name}}/resources/{{.project_name}}_pipeline/transformations/sample_trips_{{.project_name}}.py"}} +{{if not $python_package}} + {{skip "{{.project_name}}/pyproject.toml"}} +{{end}} + +{{if not $pipeline}} + {{skip "{{.project_name}}/resources/{{.project_name_short}}_etl.pipeline.yml"}} + {{skip "{{.project_name}}/resources/{{.project_name_short}}_etl"}} +{{end}} + +{{if not $job}} + {{skip "{{.project_name}}/resources/sample_job/sample_notebook.ipynb"}} + {{skip "{{.project_name}}/resources/sample_job/sample_python_file.py"}} + {{if not $pipeline}} + {{skip "{{.project_name}}/resources/sample_job"}} + {{end}} +{{end}} + +{{if and (not $pipeline) (not $job) (not $python_package)}} + {{skip "{{.project_name}}/lib/shared"}} +{{end}} + +# Remove tests when Python package is not included (no pytest dependencies) +{{if not $python_package}} + {{skip "{{.project_name}}/tests"}} + {{skip "{{.project_name}}/fixtures"}} +{{end}} + +# Remove .gitkeep files for a non-empty project +{{if or $python_package $job $pipeline}} + {{skip "{{.project_name}}/lib/.gitkeep"}} +{{end}} + +# Language-specific file selection +{{if $pipeline}} + {{if $isSQL}} + {{skip "{{.project_name}}/resources/{{.project_name_short}}_etl/transformations/*.py"}} + {{skip "{{.project_name}}/resources/{{.project_name_short}}_etl/utilities"}} + {{else}} + {{skip "{{.project_name}}/resources/{{.project_name_short}}_etl/transformations/*.sql"}} + {{end}} +{{end}} + +{{if $lakeflow_only}} + # Only include a simple job.yml directly in resources/{{.project_name_short}}_etl + {{skip "{{.project_name}}/resources/sample_job/**"}} {{else}} - {{skip "{{.project_name}}/resources/{{.project_name}}_pipeline/transformations/sample_zones_{{.project_name}}.sql"}} - {{skip "{{.project_name}}/resources/{{.project_name}}_pipeline/transformations/sample_trips_{{.project_name}}.sql"}} + # Include a full job that might have more than one task + {{skip "{{.project_name}}/resources/{{.project_name_short}}_etl/*.job.yml"}} {{end}} diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.gitignore.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.gitignore similarity index 77% rename from libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.gitignore.tmpl rename to libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.gitignore index f6a3b5ff93..e566c51f74 100644 --- a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.gitignore.tmpl +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.gitignore @@ -4,5 +4,7 @@ dist/ __pycache__/ *.egg-info .venv/ +scratch/** +!scratch/README.md **/explorations/** **/!explorations/README.md diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.vscode/extensions.json b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.vscode/extensions.json index 5d15eba363..1f39c33087 100644 --- a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.vscode/extensions.json +++
b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.vscode/extensions.json @@ -1,7 +1,6 @@ { "recommendations": [ "databricks.databricks", - "ms-python.vscode-pylance", "redhat.vscode-yaml" ] } diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.vscode/settings.json b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.vscode/settings.json new file mode 100644 index 0000000000..d8468d7b60 --- /dev/null +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.vscode/settings.json @@ -0,0 +1,39 @@ +{ + "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", + "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", + "python.testing.pytestArgs": [ + "." + ], + "files.exclude": { + "**/*.egg-info": true, + "**/__pycache__": true, + ".pytest_cache": true, + "dist": true, + }, + "files.associations": { + "**/.gitkeep": "markdown" + } + + // Pylance settings (VS Code) + // Set typeCheckingMode to "basic" to enable type checking! + "python.analysis.typeCheckingMode": "off", + "python.analysis.extraPaths": ["src", "lib", "resources"], + "python.analysis.diagnosticMode": "workspace", + "python.analysis.stubPath": ".vscode", + + // Pyright settings (Cursor) + // Set typeCheckingMode to "basic" to enable type checking! + "cursorpyright.analysis.typeCheckingMode": "off", + "cursorpyright.analysis.extraPaths": ["src", "lib", "resources"], + "cursorpyright.analysis.diagnosticMode": "workspace", + "cursorpyright.analysis.stubPath": ".vscode", + + // General Python settings + "python.defaultInterpreterPath": "./.venv/bin/python", + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true, + "[python]": { + "editor.defaultFormatter": "ms-python.python", + "editor.formatOnSave": true, + }, +} diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.vscode/settings.json.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.vscode/settings.json.tmpl deleted file mode 100644 index 6a87715ae2..0000000000 --- a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.vscode/settings.json.tmpl +++ /dev/null @@ -1,22 +0,0 @@ -{ - "python.analysis.stubPath": ".vscode", - "databricks.python.envFile": "${workspaceFolder}/.env", - "jupyter.interactiveWindow.cellMarker.codeRegex": "^# COMMAND ----------|^# Databricks notebook source|^(#\\s*%%|#\\s*\\|#\\s*In\\[\\d*?\\]|#\\s*In\\[ \\])", - "jupyter.interactiveWindow.cellMarker.default": "# COMMAND ----------", - "python.testing.pytestArgs": [ - "." - ], - "python.testing.unittestEnabled": false, - "python.testing.pytestEnabled": true, - {{- /* Unfortunately extraPaths doesn't support globs!! 
See: https://github.com/microsoft/pylance-release/issues/973 */}} - "python.analysis.extraPaths": ["resources/{{.project_name}}_pipeline"], - "files.exclude": { - "**/*.egg-info": true, - "**/__pycache__": true, - ".pytest_cache": true, - }, - "[python]": { - "editor.defaultFormatter": "ms-python.black-formatter", - "editor.formatOnSave": true, - }, -} diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/README.md.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/README.md.tmpl index 837213a189..16a78c6242 100644 --- a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/README.md.tmpl +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/README.md.tmpl @@ -2,40 +2,88 @@ The '{{.project_name}}' project was generated by using the Lakeflow Pipelines template. -## Setup +* `lib/`: Python source code for this project. +* `lib/shared`: Shared source code across all jobs/pipelines/etc. +* `resources/{{.project_name_short}}_etl`: Pipeline code and assets for the {{.project_name_short}}_etl pipeline. +* `resources/`: Resource configurations (jobs, pipelines, etc.) +{{- if (eq .include_python "yes")}} +* `tests/`: Unit tests. +* `fixtures/`: Fixtures for data sets (primarily used for testing). +{{end}} -1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html +## Getting started -2. Authenticate to your Databricks workspace, if you have not done so already: - ``` - $ databricks auth login - ``` +Choose how you want to work on this project: + +(a) Directly in your Databricks workspace, see + https://docs.databricks.com/dev-tools/bundles/workspace. + +(b) Locally with an IDE like Cursor or VS Code, see + https://docs.databricks.com/vscode-ext. + +(c) With command line tools, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html + +{{- if or (eq .include_python "yes") (eq .include_job "yes")}} + +Dependencies for this project should be installed using uv: + +* Make sure you have the UV package manager installed. + It's an alternative to tools like pip: https://docs.astral.sh/uv/getting-started/installation/. +* Run `uv sync --dev` to install the project's dependencies. +{{end}} -3. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from - https://docs.databricks.com/dev-tools/vscode-ext.html. Or the PyCharm plugin from - https://www.databricks.com/blog/announcing-pycharm-integration-databricks. +# Using this project using the CLI +The Databricks workspace and IDE extensions provide a graphical interface for working +with this project. It's also possible to interact with it directly using the CLI: -## Deploying resources +1. Authenticate to your Databricks workspace, if you have not done so already: + ``` + $ databricks configure + ``` -1. To deploy a development copy of this project, type: +2. To deploy a development copy of this project, type: ``` $ databricks bundle deploy --target dev ``` (Note that "dev" is the default target, so the `--target` parameter is optional here.) -2. Similarly, to deploy a production copy, type: + This deploys everything that's defined for this project. + {{- if eq .lakeflow_only "yes"}} + For example, the default template would deploy a pipeline called + `[dev yourname] {{.project_name_short}}_etl` to your workspace. + {{- else}} + For example, the default template would deploy a job called + `[dev yourname] {{.project_name_short}}_job` to your workspace. 
+ {{- end}} + You can find that resource by opening your workspace and clicking on **Jobs & Pipelines**. + +3. Similarly, to deploy a production copy, type: ``` $ databricks bundle deploy --target prod ``` -3. Use the "summary" comand to review everything that was deployed: - ``` - $ databricks bundle summary - ``` + {{- if eq .lakeflow_only "yes"}} + Note that the default template includes a job that runs the pipeline every day + (defined in resources/{{.project_name_short}}_etl/{{.project_name_short}}_job.job.yml). The schedule + is paused when deploying in development mode (see + https://docs.databricks.com/dev-tools/bundles/deployment-modes.html). + {{- else}} + Note that the default job from the template has a schedule that runs every day + (defined in resources/{{.project_name_short}}_etl/{{.project_name_short}}_etl.job.yml). The schedule + is paused when deploying in development mode (see + https://docs.databricks.com/dev-tools/bundles/deployment-modes.html). + {{- end}} 4. To run a job or pipeline, use the "run" command: ``` $ databricks bundle run ``` + +{{- if (eq .include_python "yes")}} +5. Finally, to run tests locally, use `pytest`: + ``` + $ uv run pytest + ``` +{{end}} diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/databricks.yml.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/databricks.yml.tmpl index 1108b20128..d321d52a11 100644 --- a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/databricks.yml.tmpl +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/databricks.yml.tmpl @@ -1,3 +1,5 @@ +{{$with_classic := (ne .serverless "yes") -}} +{{$with_python := (eq .include_python "yes") -}} # This is a Databricks asset bundle definition for {{.project_name}}. # See https://docs.databricks.com/dev-tools/bundles/index.html for documentation. bundle: @@ -8,14 +10,26 @@ include: - resources/*.yml - resources/*/*.yml +{{- if $with_python}} + +artifacts: + python_artifact: + type: whl + build: uv build --wheel +{{- end}} + # Variable declarations. These variables are assigned in the dev/prod targets below. variables: catalog: description: The catalog to use schema: description: The schema to use - notifications: - description: The email addresses to use for failure notifications + +{{- $dev_schema := "dev" }} +{{- $prod_schema := "prod" }} +{{- if (regexp "^yes").MatchString .personal_schemas}} + {{- $dev_schema = "${workspace.current_user.short_name}"}} +{{- end}} targets: dev: @@ -29,19 +43,20 @@ targets: host: {{workspace_host}} variables: catalog: {{.default_catalog}} - schema: {{template `dev_schema` .}} - notifications: [] - + schema: {{$dev_schema}} + {{- if $with_classic}} + presets: + artifacts_dynamic_version: true + {{- end}} prod: mode: production workspace: host: {{workspace_host}} # We explicitly deploy to /Workspace/Users/{{user_name}} to make sure we only have a single copy. 
root_path: /Workspace/Users/{{user_name}}/.bundle/${bundle.name}/${bundle.target} + variables: + catalog: {{.default_catalog}} + schema: {{$prod_schema}} permissions: - {{if is_service_principal}}service_principal{{else}}user{{end}}_name: {{user_name}} level: CAN_MANAGE - variables: - catalog: {{.default_catalog}} - schema: {{template `prod_schema` .}} - notifications: [{{user_name}}] diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/fixtures/.gitkeep.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/fixtures/.gitkeep.tmpl new file mode 100644 index 0000000000..a84a182f1f --- /dev/null +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/fixtures/.gitkeep.tmpl @@ -0,0 +1,15 @@ +# Test fixtures directory + +{{- /* +We don't want to have too many README.md files, since they +stand out so much. But we do need to have a file here to make +sure the folder is added to Git. +*/}} + +Add JSON or CSV files here. In tests, use them with `load_fixture()`: + +``` +def test_using_fixture(load_fixture): + data = load_fixture("my_data.json") + assert len(data) >= 1 +``` diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/lib/.gitkeep b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/lib/.gitkeep new file mode 100644 index 0000000000..0e0ed1e00b --- /dev/null +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/lib/.gitkeep @@ -0,0 +1 @@ +This folder is reserved for Databricks Asset Bundles source files. diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/lib/shared/__init__.py b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/lib/shared/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/lib/shared/taxis.py.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/lib/shared/taxis.py.tmpl new file mode 100644 index 0000000000..a7309cd4c5 --- /dev/null +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/lib/shared/taxis.py.tmpl @@ -0,0 +1,7 @@ +from databricks.sdk.runtime import spark +from pyspark.sql import DataFrame + + +def find_all_taxis() -> DataFrame: + """Find all taxi data.""" + return spark.read.table("samples.nyctaxi.trips") diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/pyproject.toml.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/pyproject.toml.tmpl new file mode 100644 index 0000000000..8733be6999 --- /dev/null +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/pyproject.toml.tmpl @@ -0,0 +1,32 @@ +[project] +name = "{{.project_name}}" +version = "0.0.1" +authors = [{ name = "{{user_name}}" }] +requires-python = ">= 3.11" + +[dependency-groups] +dev = [ + "pytest", + "databricks-dlt", + + # databricks-connect can be used to run parts of this project locally. + # Note that for local development, you should use a version that is not newer + # than the remote cluster or serverless compute you connect to. + # See also https://docs.databricks.com/dev-tools/databricks-connect.html. 
+ "databricks-connect{{template "conservative_db_connect_version_spec"}}", +] + +[tool.pytest.ini_options] +pythonpath = "lib" +testpaths = [ + "tests", + "resources", +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["lib"] +sources = ["lib"] diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/.gitkeep b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/.gitkeep new file mode 100644 index 0000000000..3e09c14c18 --- /dev/null +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/.gitkeep @@ -0,0 +1 @@ +This folder is reserved for Databricks Asset Bundles resource definitions. diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/sample_job/sample_job.job.yml.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/sample_job/sample_job.job.yml.tmpl new file mode 100644 index 0000000000..c78c1a22ac --- /dev/null +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/sample_job/sample_job.job.yml.tmpl @@ -0,0 +1,98 @@ +# A sample job for {{.project_name}}. + +{{- $serverless := (eq .serverless "yes")}} +{{- $python_package := (eq .include_python "yes")}} +{{- $notebook := (eq .include_job "yes")}} +{{- $pipeline := (eq .include_pipeline "yes")}} + +resources: + jobs: + sample_job: + name: sample_job + + trigger: + # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger + periodic: + interval: 1 + unit: DAYS + + #email_notifications: + # on_failure: + # - your_email@example.com + + parameters: + - name: catalog + default: ${var.catalog} + - name: schema + default: ${var.schema} + + tasks: + +{{- if $notebook}} + - task_key: notebook_task + notebook_task: + notebook_path: sample_notebook.ipynb + {{- if $serverless}} + environment_key: default + {{- else}} + job_cluster_key: job_cluster + {{- if $python_package}} + libraries: + # By default we just include the .whl file generated for the {{.project_name_short}} package. + # See https://docs.databricks.com/dev-tools/bundles/library-dependencies.html + # for more information on how to add other libraries. + - whl: ../../dist/*.whl + {{- end}} + {{- end}} +{{- end}} + +{{- if $python_package}} + - task_key: python_file_task + depends_on: + - task_key: notebook_task + spark_python_task: + python_file: sample_python_file.py + {{- if $serverless}} + environment_key: default + {{- else}} + job_cluster_key: job_cluster + {{- if $python_package}} + libraries: + - whl: ../../dist/*.whl + {{- end}} + {{- end}} +{{- end}} + +{{- if $pipeline}} + - task_key: refresh_pipeline + depends_on: + - task_key: notebook_task + pipeline_task: + pipeline_id: ${resources.pipelines.{{.project_name_short}}_etl.id} +{{- end}} + +{{- if $serverless}} + + environments: + - environment_key: default + spec: + client: "2" + {{- if $python_package}} + dependencies: + # By default we just include the .whl file generated for the {{.project_name_short}} package. + # See https://docs.databricks.com/dev-tools/bundles/library-dependencies.html + # for more information on how to add other libraries. 
+ - ../../dist/*.whl + {{- end}} +{{- else}} + + job_clusters: + - job_cluster_key: job_cluster + new_cluster: + spark_version: {{template "latest_lts_dbr_version"}} + node_type_id: {{smallest_node_type}} + data_security_mode: SINGLE_USER + autoscale: + min_workers: 1 + max_workers: 4 +{{- end}} diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/sample_job/sample_notebook.ipynb.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/sample_job/sample_notebook.ipynb.tmpl new file mode 100644 index 0000000000..22a204435a --- /dev/null +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/sample_job/sample_notebook.ipynb.tmpl @@ -0,0 +1,83 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "ee353e42-ff58-4955-9608-12865bd0950e", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Default notebook\n", + "\n", + "This default notebook is executed using a Lakeflow job as defined in resources/sample_job.job.yml." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Reload wheel file dependencies every time they are updated\n", + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + {{- if (eq .include_python "yes") }} + "import sys\n", + "\n", + "sys.path.append(\"../src\")\n", + "from {{.project_name}} import main\n", + "\n", + "main.get_taxis().show(10)" + {{else}} + "spark.read.table(\"samples.nyctaxi.trips\")" + {{end -}} + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "notebook", + "widgets": {} + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/sample_job/sample_python_file.py.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/sample_job/sample_python_file.py.tmpl new file mode 100644 index 0000000000..818837eecb --- /dev/null +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/sample_job/sample_python_file.py.tmpl @@ -0,0 +1,19 @@ +import argparse +from datetime import datetime +from shared import taxis + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--catalog", default="{{.default_catalog}}") + parser.add_argument("--schema", default="default") + args = parser.parse_args() + + df = taxis.find_all_taxis() + + table_name = f"{args.catalog}.{args.schema}.taxis_{{.project_name}}" + df.write.mode("overwrite").saveAsTable(table_name) + + print(f"Wrote {df.count()} taxi records to {table_name}") + +if __name__ == "__main__": + main() diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/README.md.tmpl 
b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/README.md.tmpl new file mode 100644 index 0000000000..b89974cda7 --- /dev/null +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/README.md.tmpl @@ -0,0 +1,22 @@ +# {{.project_name}} + +This folder defines all source code for the {{.project_name}} pipeline: + +- `explorations/`: Ad-hoc notebooks used to explore the data processed by this pipeline. +- `transformations/`: All dataset definitions and transformations. +- `utilities/` (optional): Utility functions and Python modules used in this pipeline. +- `data_sources/` (optional): View definitions describing the source data for this pipeline. + +## Getting Started + +To get started, go to the `transformations` folder -- most of the relevant source code lives there: + +* By convention, every dataset under `transformations` is in a separate file. +* Take a look at the sample under "sample_trips_{{.project_name}}.py" to get familiar with the syntax. + Read more about the syntax at https://docs.databricks.com/dlt/python-ref.html. +* Use `Run file` to run and preview a single transformation. +* Use `Run pipeline` to run _all_ transformations in the entire pipeline. +* Use `+ Add` in the file browser to add a new data set definition. +* Use `Schedule` to run the pipeline on a schedule! + +For more tutorials and reference material, see https://docs.databricks.com/dlt. diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/transformations/sample_trips_{{.project_name}}.py.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/transformations/sample_trips_{{.project_name}}.py.tmpl similarity index 71% rename from libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/transformations/sample_trips_{{.project_name}}.py.tmpl rename to libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/transformations/sample_trips_{{.project_name}}.py.tmpl index 963856d6b4..daa2a78517 100644 --- a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/transformations/sample_trips_{{.project_name}}.py.tmpl +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/transformations/sample_trips_{{.project_name}}.py.tmpl @@ -9,5 +9,7 @@ from utilities import utils @dlt.table -def sample_trips_{{ .project_name }}(): - return spark.read.table("samples.nyctaxi.trips").withColumn("trip_distance_km", utils.distance_km(col("trip_distance"))) +def sample_trips_{{.project_name}}(): + return spark.read.table("samples.nyctaxi.trips").withColumn( + "trip_distance_km", utils.distance_km(col("trip_distance")) + ) diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/transformations/sample_trips_{{.project_name}}.sql.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/transformations/sample_trips_{{.project_name}}.sql.tmpl similarity index 66% rename from libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/transformations/sample_trips_{{.project_name}}.sql.tmpl rename to 
libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/transformations/sample_trips_{{.project_name}}.sql.tmpl index b95a95da4d..78e346063d 100644 --- a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/transformations/sample_trips_{{.project_name}}.sql.tmpl +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/transformations/sample_trips_{{.project_name}}.sql.tmpl @@ -2,8 +2,9 @@ -- Edit the sample below or add new transformations -- using "+ Add" in the file browser. -CREATE MATERIALIZED VIEW sample_trips_{{ .project_name }} AS +CREATE MATERIALIZED VIEW sample_trips_{{.project_name}} AS SELECT pickup_zip, - fare_amount + fare_amount, + trip_distance FROM samples.nyctaxi.trips diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/transformations/sample_zones_{{.project_name}}.py.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/transformations/sample_zones_{{.project_name}}.py.tmpl similarity index 57% rename from libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/transformations/sample_zones_{{.project_name}}.py.tmpl rename to libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/transformations/sample_zones_{{.project_name}}.py.tmpl index 89a81121f8..0911a4c21d 100644 --- a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/transformations/sample_zones_{{.project_name}}.py.tmpl +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/transformations/sample_zones_{{.project_name}}.py.tmpl @@ -8,6 +8,11 @@ from pyspark.sql.functions import col, sum @dlt.table -def sample_zones_{{ .project_name }}(): +def sample_zones_{{.project_name}}(): # Read from the "sample_trips" table, then sum all the fares - return spark.read.table("sample_trips_{{ .project_name }}").groupBy(col("pickup_zip")).agg(sum("fare_amount").alias("total_fare")) + return ( + spark.read.table("sample_trips_{{.project_name}}") + .groupBy(col("pickup_zip")) + .agg(sum("fare_amount").alias("total_fare")) + ) + diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/transformations/sample_zones_{{.project_name}}.sql.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/transformations/sample_zones_{{.project_name}}.sql.tmpl similarity index 68% rename from libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/transformations/sample_zones_{{.project_name}}.sql.tmpl rename to libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/transformations/sample_zones_{{.project_name}}.sql.tmpl index ab84f4066a..d0ec172129 100644 --- a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/transformations/sample_zones_{{.project_name}}.sql.tmpl +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/transformations/sample_zones_{{.project_name}}.sql.tmpl @@ -2,9 +2,9 @@ -- Edit the sample below or 
add new transformations -- using "+ Add" in the file browser. -CREATE MATERIALIZED VIEW sample_zones_{{ .project_name }} AS +CREATE MATERIALIZED VIEW sample_zones_{{.project_name}} AS SELECT pickup_zip, SUM(fare_amount) AS total_fare -FROM sample_trips_{{ .project_name }} +FROM sample_trips_{{.project_name}} GROUP BY pickup_zip diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/utilities/utils.py.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/utilities/utils.py.tmpl new file mode 100644 index 0000000000..f0f4e940f7 --- /dev/null +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/utilities/utils.py.tmpl @@ -0,0 +1,12 @@ +from pyspark.sql.functions import col, when + + +def distance_km(distance_col): + """Convert distance from miles to kilometers.""" + return distance_col * 1.60934 + + +def format_currency(amount_col): + """Format amount as currency.""" + return when(col(amount_col).isNotNull(), + col(amount_col).cast("decimal(10,2)")) diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/{{.project_name_short}}_etl.pipeline.yml.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/{{.project_name_short}}_etl.pipeline.yml.tmpl new file mode 100644 index 0000000000..3231809364 --- /dev/null +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/{{.project_name_short}}_etl.pipeline.yml.tmpl @@ -0,0 +1,39 @@ +# The main pipeline for {{.project_name}} + +{{- $serverless := (eq .serverless "yes")}} + +resources: + pipelines: + {{.project_name_short}}_etl: + {{- /* Note that pipeline names must be unique in a workspace, + * so we use the project name as part of the name. + */}} + name: {{.project_name_short}}_etl +{{- if or (eq default_catalog "") (eq default_catalog "hive_metastore")}} + {{- if $serverless }} + ## Catalog is required for serverless compute + catalog: main + {{- else}} + ## Specify the 'catalog' field to configure this pipeline to make use of Unity Catalog: + # catalog: ${var.catalog} + {{- end}} +{{- else}} + catalog: ${var.catalog} +{{- end}} + schema: ${var.schema} +{{- if $serverless }} + serverless: true +{{- end}} + root_path: "." + + libraries: + - glob: + include: transformations/** + +{{- if eq .include_python "yes"}} + environment: + dependencies: + # By default we include the .whl file generated for the project's pyproject.toml + # in the root of this project. Any other dependencies can be added there. + - ../../dist/*.whl +{{- end}} diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/{{.project_name_short}}_job.job.yml.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/{{.project_name_short}}_job.job.yml.tmpl new file mode 100644 index 0000000000..4bf25eb4c3 --- /dev/null +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/{{.project_name_short}}_job.job.yml.tmpl @@ -0,0 +1,25 @@ +# The job that triggers {{.project_name_short}}_etl. 
+ +{{- /* For Lakeflow Pipelines, we only include a simple job.yml + * directly in resources/{{.project_name_short}}_etl + */}} + +resources: + jobs: + {{.project_name_short}}_job: + name: {{.project_name_short}}_job + + trigger: + # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger + periodic: + interval: 1 + unit: DAYS + + #email_notifications: + # on_failure: + # - your_email@example.com + + tasks: + - task_key: refresh_pipeline + pipeline_task: + pipeline_id: ${resources.pipelines.{{.project_name_short}}_etl.id} diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/README.md.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/README.md.tmpl deleted file mode 100644 index b085a301a6..0000000000 --- a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/README.md.tmpl +++ /dev/null @@ -1,48 +0,0 @@ -{{- if (eq .language "python") -}} - -# {{template `pipeline_name` .}} - -This folder defines all source code for the {{template `pipeline_name` .}} pipeline: - -- `explorations`: Ad-hoc notebooks used to explore the data processed by this pipeline. -- `transformations`: All dataset definitions and transformations. -- `utilities` (optional): Utility functions and Python modules used in this pipeline. -- `data_sources` (optional): View definitions describing the source data for this pipeline. - -## Getting Started - -To get started, go to the `transformations` folder -- most of the relevant source code lives there: - -* By convention, every dataset under `transformations` is in a separate file. -* Take a look at the sample under "sample_trips_{{ .project_name }}.py" to get familiar with the syntax. - Read more about the syntax at https://docs.databricks.com/dlt/python-ref.html. -* Use `Run file` to run and preview a single transformation. -* Use `Run pipeline` to run _all_ transformations in the entire pipeline. -* Use `+ Add` in the file browser to add a new data set definition. -* Use `Schedule` to run the pipeline on a schedule! - -For more tutorials and reference material, see https://docs.databricks.com/dlt. -{{ else -}} - -# {{template `pipeline_name` .}} - -This folder defines all source code for the '{{template `pipeline_name` .}}' pipeline: - -- `explorations`: Ad-hoc notebooks used to explore the data processed by this pipeline. -- `transformations`: All dataset definitions and transformations. -- `data_sources` (optional): View definitions describing the source data for this pipeline. - -## Getting Started - -To get started, go to the `transformations` folder -- most of the relevant source code lives there: - -* By convention, every dataset under `transformations` is in a separate file. -* Take a look at the sample under "sample_trips_{{ .project_name }}.sql" to get familiar with the syntax. - Read more about the syntax at https://docs.databricks.com/dlt/sql-ref.html. -* Use `Run file` to run and preview a single transformation. -* Use `Run pipeline` to run _all_ transformations in the entire pipeline. -* Use `+ Add` in the file browser to add a new data set definition. -* Use `Schedule` to run the pipeline on a schedule! - -For more tutorials and reference material, see https://docs.databricks.com/dlt. 
-{{ end -}} diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/explorations/sample_exploration.ipynb.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/explorations/sample_exploration.ipynb.tmpl deleted file mode 100644 index 967e663fae..0000000000 --- a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/explorations/sample_exploration.ipynb.tmpl +++ /dev/null @@ -1,130 +0,0 @@ -{{- if (eq .language "python") -}} -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "19a992e9-55e0-49e4-abc7-8c92c420dd5b", - "showTitle": false, - "tableResultSettingsMap": {}, - "title": "" - } - }, - "source": [ - "### Example Exploratory Notebook\n", - "\n", - "Use this notebook to explore the data generated by the pipeline in your preferred programming language.\n", - "\n", - "**Note**: This notebook is not executed as part of the pipeline." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "1b0a82fa-3c6a-4f29-bb43-ded1c4fd77c6", - "showTitle": false, - "tableResultSettingsMap": {}, - "title": "" - } - }, - "outputs": [], - "source": [ - "# !!! Before performing any data analysis, make sure to run the pipeline to materialize the sample datasets. The tables referenced in this notebook depend on that step.\n", - "\n", - "display(spark.sql(\"SELECT * FROM {{ .default_catalog}}.{{template `static_dev_schema` .}}.{{ .project_name }}\"))" - ] - } - ], - "metadata": { - "application/vnd.databricks.v1+notebook": { - "computePreferences": null, - "dashboards": [], - "environmentMetadata": null, - "inputWidgetPreferences": null, - "language": "python", - "notebookMetadata": { - "pythonIndentUnit": 2 - }, - "notebookName": "sample_exploration", - "widgets": {} - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} -{{ else -}} -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "3bd3cbb1-1518-4d0a-a8d1-f08da3f8840b", - "showTitle": false, - "tableResultSettingsMap": {}, - "title": "" - } - }, - "source": [ - "### Example Exploratory Notebook\n", - "\n", - "Use this notebook to explore the data generated by the pipeline in your preferred programming language.\n", - "\n", - "**Note**: This notebook is not executed as part of the pipeline." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "d30a8e05-bf7a-47e1-982e-b37e64cd6d43", - "showTitle": false, - "tableResultSettingsMap": {}, - "title": "" - } - }, - "outputs": [], - "source": [ - "-- !!! Before performing any data analysis, make sure to run the pipeline to materialize the sample datasets. 
The tables referenced in this notebook depend on that step.\n", - "\n", - "USE CATALOG `{{.default_catalog}}`;\n", - "USE SCHEMA `{{template `static_dev_schema` .}}`;\n", - "\n", - "SELECT * from {{ .project_name }};" - ] - } - ], - "metadata": { - "application/vnd.databricks.v1+notebook": { - "computePreferences": null, - "dashboards": [], - "environmentMetadata": null, - "inputWidgetPreferences": null, - "language": "sql", - "notebookMetadata": {}, - "notebookName": "sample_exploration", - "widgets": {} - }, - "language_info": { - "name": "sql" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} -{{ end -}} diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/utilities/utils.py b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/utilities/utils.py deleted file mode 100644 index ff039898f0..0000000000 --- a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/utilities/utils.py +++ /dev/null @@ -1,8 +0,0 @@ -from pyspark.sql.functions import udf -from pyspark.sql.types import FloatType - - -@udf(returnType=FloatType()) -def distance_km(distance_miles): - """Convert distance from miles to kilometers (1 mile = 1.60934 km).""" - return distance_miles * 1.60934 diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/{{.project_name}}.pipeline.yml.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/{{.project_name}}.pipeline.yml.tmpl deleted file mode 100644 index 23df081f00..0000000000 --- a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name}}_pipeline/{{.project_name}}.pipeline.yml.tmpl +++ /dev/null @@ -1,12 +0,0 @@ -resources: - pipelines: - {{template `pipeline_name` .}}: - name: {{template `pipeline_name` .}} - serverless: true - channel: "PREVIEW" - catalog: ${var.catalog} - schema: ${var.schema} - root_path: "." - libraries: - - glob: - include: transformations/** diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/tests/conftest.py b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/tests/conftest.py new file mode 100644 index 0000000000..8037a4647c --- /dev/null +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/tests/conftest.py @@ -0,0 +1,93 @@ +"""This file configures pytest. + +This file is in the root since it can be used for tests in any place in this +project, including tests under resources/. +""" + +import os, sys, pathlib +from contextlib import contextmanager + + +try: + from databricks.connect import DatabricksSession + from databricks.sdk import WorkspaceClient + from pyspark.sql import SparkSession + import pytest + import json + import csv + import os +except ImportError: + raise ImportError("Test dependencies not found.\n\nRun tests using 'uv run pytest'. See http://docs.astral.sh/uv to learn more about uv.") + + +@pytest.fixture() +def spark() -> SparkSession: + """Provide a SparkSession fixture for tests. + + Minimal example: + def test_uses_spark(spark): + df = spark.createDataFrame([(1,)], ["x"]) + assert df.count() == 1 + """ + return DatabricksSession.builder.getOrCreate() + +@pytest.fixture() +def load_fixture(spark: SparkSession): + """Provide a callable to load JSON or CSV from fixtures/ directory. 
+ + Example usage: + + def test_using_fixture(load_fixture): + data = load_fixture("my_data.json") + assert data.count() >= 1 + """ + def _loader(filename: str): + path = pathlib.Path(__file__).parent.parent / "fixtures" / filename + suffix = path.suffix.lower() + if suffix == ".json": + rows = json.loads(path.read_text()) + return spark.createDataFrame(rows) + if suffix == ".csv": + with path.open(newline="") as f: + rows = list(csv.DictReader(f)) + return spark.createDataFrame(rows) + raise ValueError(f"Unsupported fixture type for: {filename}") + return _loader + + +def _enable_fallback_compute(): + """Enable serverless compute if no compute is specified.""" + conf = WorkspaceClient().config + if conf.serverless_compute_id or conf.cluster_id or os.environ.get("SPARK_REMOTE"): + return + + url = "https://docs.databricks.com/dev-tools/databricks-connect/cluster-config" + print("☁️ no compute specified, falling back to serverless compute", file=sys.stderr) + print(f"    see {url} for manual configuration", file=sys.stderr) + + os.environ["DATABRICKS_SERVERLESS_COMPUTE_ID"] = "auto" + + +@contextmanager +def _allow_stderr_output(config: pytest.Config): + """Temporarily disable pytest output capture.""" + capman = config.pluginmanager.get_plugin("capturemanager") + if capman: + with capman.global_and_fixture_disabled(): + yield + else: + yield + + +def pytest_configure(config: pytest.Config): + """Configure pytest session.""" + with _allow_stderr_output(config): + _enable_fallback_compute() + + # Initialize Spark session eagerly, so it is available even when + # SparkSession.builder.getOrCreate() is used. For DB Connect 15+, + # we validate version compatibility with the remote cluster. + if hasattr(DatabricksSession.builder, "validateSession"): + DatabricksSession.builder.validateSession().getOrCreate() + else: + DatabricksSession.builder.getOrCreate() diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/tests/sample_taxis_test.py.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/tests/sample_taxis_test.py.tmpl new file mode 100644 index 0000000000..a782015363 --- /dev/null +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/tests/sample_taxis_test.py.tmpl @@ -0,0 +1,8 @@ +from databricks.sdk.runtime import spark +from pyspark.sql import DataFrame +from shared import taxis + + +def test_find_all_taxis(): + results = taxis.find_all_taxis() + assert results.count() > 5 From a7f144795014686b508f78bc98a71947dc9c592b Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Mon, 8 Sep 2025 11:35:37 +0200 Subject: [PATCH 2/8] Fix .gitignore issue --- .../{.gitignore => .gitignore.tmpl} | 0 .../sample_exploration.ipynb.tmpl | 63 +++++++++++++++++++ 2 files changed, 63 insertions(+) rename libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/{.gitignore => .gitignore.tmpl} (100%) create mode 100644 libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/explorations/sample_exploration.ipynb.tmpl diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.gitignore b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.gitignore.tmpl similarity index 100% rename from libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.gitignore rename to libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.gitignore.tmpl diff --git 
a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/explorations/sample_exploration.ipynb.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/explorations/sample_exploration.ipynb.tmpl new file mode 100644 index 0000000000..62556be72e --- /dev/null +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/explorations/sample_exploration.ipynb.tmpl @@ -0,0 +1,63 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "19a992e9-55e0-49e4-abc7-8c92c420dd5b", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### Example Exploratory Notebook\n", + "\n", + "Use this notebook to explore the data generated by the pipeline in your preferred programming language.\n", + "\n", + "**Note**: This notebook is not executed as part of the pipeline." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "1b0a82fa-3c6a-4f29-bb43-ded1c4fd77c6", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "# !!! Before performing any data analysis, make sure to run the pipeline to materialize the sample datasets. The tables referenced in this notebook depend on that step.\n", + "\n", + "display(spark.sql(\"SELECT * FROM {{.default_catalog}}.{{short_name}}.sample_trips_{{.project_name}}\"))" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "computePreferences": null, + "dashboards": [], + "environmentMetadata": null, + "inputWidgetPreferences": null, + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "sample_exploration", + "widgets": {} + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From a7ff123c87d96622948ef0be52d575744a64abdb Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Sun, 14 Sep 2025 15:44:04 +0200 Subject: [PATCH 3/8] Process reviewer feedback --- .../my_lakeflow_pipelines/.vscode/extensions.json | 3 ++- .../my_lakeflow_pipelines/.vscode/settings.json | 4 ++-- .../lakeflow_pipelines_etl.pipeline.yml | 2 +- .../sample_trips_my_lakeflow_pipelines.py | 5 +---- .../lakeflow_pipelines_etl/utilities/utils.py | 12 ------------ .../my_lakeflow_pipelines/.vscode/extensions.json | 3 ++- .../my_lakeflow_pipelines/.vscode/settings.json | 4 ++-- .../lakeflow_pipelines_etl.pipeline.yml | 2 +- .../{{.project_name}}/.vscode/extensions.json | 3 ++- .../template/{{.project_name}}/.vscode/settings.json | 4 ++-- .../sample_trips_{{.project_name}}.py.tmpl | 5 +---- .../utilities/utils.py.tmpl | 12 ------------ .../{{.project_name_short}}_etl.pipeline.yml.tmpl | 2 +- .../template/{{.project_name}}/tests/conftest.py | 7 ++++++- 14 files changed, 23 insertions(+), 45 deletions(-) delete mode 100644 acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/utilities/utils.py delete mode 100644 libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/utilities/utils.py.tmpl diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/.vscode/extensions.json 
b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/.vscode/extensions.json index 1f39c33087..75a111a6a9 100644 --- a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/.vscode/extensions.json +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/.vscode/extensions.json @@ -1,6 +1,7 @@ { "recommendations": [ "databricks.databricks", - "redhat.vscode-yaml" + "redhat.vscode-yaml", + "ms-python.black-formatter" ] } diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/.vscode/settings.json b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/.vscode/settings.json index d8468d7b60..c49593bc59 100644 --- a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/.vscode/settings.json +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/.vscode/settings.json @@ -12,7 +12,7 @@ }, "files.associations": { "**/.gitkeep": "markdown" - } + }, // Pylance settings (VS Code) // Set typeCheckingMode to "basic" to enable type checking! @@ -33,7 +33,7 @@ "python.testing.unittestEnabled": false, "python.testing.pytestEnabled": true, "[python]": { - "editor.defaultFormatter": "ms-python.python", + "editor.defaultFormatter": "ms-python.black-formatter", "editor.formatOnSave": true, }, } diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/lakeflow_pipelines_etl.pipeline.yml b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/lakeflow_pipelines_etl.pipeline.yml index 90779ed09c..6252d74e4e 100644 --- a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/lakeflow_pipelines_etl.pipeline.yml +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/lakeflow_pipelines_etl.pipeline.yml @@ -5,7 +5,7 @@ resources: lakeflow_pipelines_etl: name: lakeflow_pipelines_etl ## Catalog is required for serverless compute - catalog: main + catalog: ${var.catalog} schema: ${var.schema} serverless: true root_path: "." diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/transformations/sample_trips_my_lakeflow_pipelines.py b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/transformations/sample_trips_my_lakeflow_pipelines.py index 6616f607c5..46868ee7ae 100644 --- a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/transformations/sample_trips_my_lakeflow_pipelines.py +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/transformations/sample_trips_my_lakeflow_pipelines.py @@ -1,6 +1,5 @@ import dlt from pyspark.sql.functions import col -from utilities import utils # This file defines a sample transformation. 
@@ -10,6 +9,4 @@ @dlt.table def sample_trips_my_lakeflow_pipelines(): - return spark.read.table("samples.nyctaxi.trips").withColumn( - "trip_distance_km", utils.distance_km(col("trip_distance")) - ) + return spark.read.table("samples.nyctaxi.trips") diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/utilities/utils.py b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/utilities/utils.py deleted file mode 100644 index f0f4e940f7..0000000000 --- a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/utilities/utils.py +++ /dev/null @@ -1,12 +0,0 @@ -from pyspark.sql.functions import col, when - - -def distance_km(distance_col): - """Convert distance from miles to kilometers.""" - return distance_col * 1.60934 - - -def format_currency(amount_col): - """Format amount as currency.""" - return when(col(amount_col).isNotNull(), - col(amount_col).cast("decimal(10,2)")) diff --git a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/.vscode/extensions.json b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/.vscode/extensions.json index 1f39c33087..75a111a6a9 100644 --- a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/.vscode/extensions.json +++ b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/.vscode/extensions.json @@ -1,6 +1,7 @@ { "recommendations": [ "databricks.databricks", - "redhat.vscode-yaml" + "redhat.vscode-yaml", + "ms-python.black-formatter" ] } diff --git a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/.vscode/settings.json b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/.vscode/settings.json index d8468d7b60..c49593bc59 100644 --- a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/.vscode/settings.json +++ b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/.vscode/settings.json @@ -12,7 +12,7 @@ }, "files.associations": { "**/.gitkeep": "markdown" - } + }, // Pylance settings (VS Code) // Set typeCheckingMode to "basic" to enable type checking! @@ -33,7 +33,7 @@ "python.testing.unittestEnabled": false, "python.testing.pytestEnabled": true, "[python]": { - "editor.defaultFormatter": "ms-python.python", + "editor.defaultFormatter": "ms-python.black-formatter", "editor.formatOnSave": true, }, } diff --git a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/lakeflow_pipelines_etl.pipeline.yml b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/lakeflow_pipelines_etl.pipeline.yml index 90779ed09c..6252d74e4e 100644 --- a/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/lakeflow_pipelines_etl.pipeline.yml +++ b/acceptance/bundle/templates/lakeflow-pipelines/sql/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/lakeflow_pipelines_etl.pipeline.yml @@ -5,7 +5,7 @@ resources: lakeflow_pipelines_etl: name: lakeflow_pipelines_etl ## Catalog is required for serverless compute - catalog: main + catalog: ${var.catalog} schema: ${var.schema} serverless: true root_path: "." 
diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.vscode/extensions.json b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.vscode/extensions.json index 1f39c33087..75a111a6a9 100644 --- a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.vscode/extensions.json +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.vscode/extensions.json @@ -1,6 +1,7 @@ { "recommendations": [ "databricks.databricks", - "redhat.vscode-yaml" + "redhat.vscode-yaml", + "ms-python.black-formatter" ] } diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.vscode/settings.json b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.vscode/settings.json index d8468d7b60..c49593bc59 100644 --- a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.vscode/settings.json +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/.vscode/settings.json @@ -12,7 +12,7 @@ }, "files.associations": { "**/.gitkeep": "markdown" - } + }, // Pylance settings (VS Code) // Set typeCheckingMode to "basic" to enable type checking! @@ -33,7 +33,7 @@ "python.testing.unittestEnabled": false, "python.testing.pytestEnabled": true, "[python]": { - "editor.defaultFormatter": "ms-python.python", + "editor.defaultFormatter": "ms-python.black-formatter", "editor.formatOnSave": true, }, } diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/transformations/sample_trips_{{.project_name}}.py.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/transformations/sample_trips_{{.project_name}}.py.tmpl index daa2a78517..e0ee8e645e 100644 --- a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/transformations/sample_trips_{{.project_name}}.py.tmpl +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/transformations/sample_trips_{{.project_name}}.py.tmpl @@ -1,6 +1,5 @@ import dlt from pyspark.sql.functions import col -from utilities import utils # This file defines a sample transformation. 
@@ -10,6 +9,4 @@ from utilities import utils @dlt.table def sample_trips_{{.project_name}}(): - return spark.read.table("samples.nyctaxi.trips").withColumn( - "trip_distance_km", utils.distance_km(col("trip_distance")) - ) + return spark.read.table("samples.nyctaxi.trips") diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/utilities/utils.py.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/utilities/utils.py.tmpl deleted file mode 100644 index f0f4e940f7..0000000000 --- a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/utilities/utils.py.tmpl +++ /dev/null @@ -1,12 +0,0 @@ -from pyspark.sql.functions import col, when - - -def distance_km(distance_col): - """Convert distance from miles to kilometers.""" - return distance_col * 1.60934 - - -def format_currency(amount_col): - """Format amount as currency.""" - return when(col(amount_col).isNotNull(), - col(amount_col).cast("decimal(10,2)")) diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/{{.project_name_short}}_etl.pipeline.yml.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/{{.project_name_short}}_etl.pipeline.yml.tmpl index 3231809364..368fa95b92 100644 --- a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/{{.project_name_short}}_etl.pipeline.yml.tmpl +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/{{.project_name_short}}_etl.pipeline.yml.tmpl @@ -12,7 +12,7 @@ resources: {{- if or (eq default_catalog "") (eq default_catalog "hive_metastore")}} {{- if $serverless }} ## Catalog is required for serverless compute - catalog: main + catalog: ${var.catalog} {{- else}} ## Specify the 'catalog' field to configure this pipeline to make use of Unity Catalog: # catalog: ${var.catalog} diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/tests/conftest.py b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/tests/conftest.py index 8037a4647c..4df274fd43 100644 --- a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/tests/conftest.py +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/tests/conftest.py @@ -17,7 +17,9 @@ import csv import os except ImportError: - raise ImportError("Test dependencies not found.\n\nRun tests using 'uv run pytest'. See http://docs.astral.sh/uv to learn more about uv.") + raise ImportError( + "Test dependencies not found.\n\nRun tests using 'uv run pytest'. See http://docs.astral.sh/uv to learn more about uv." + ) @pytest.fixture() @@ -31,6 +33,7 @@ def test_uses_spark(spark): """ return DatabricksSession.builder.getOrCreate() + @pytest.fixture() def load_fixture(spark: SparkSession): """Provide a callable to load JSON or CSV from fixtures/ directory. 
@@ -41,6 +44,7 @@ def test_using_fixture(load_fixture): data = load_fixture("my_data.json") assert data.count() >= 1 """ + def _loader(filename: str): path = pathlib.Path(__file__).parent.parent / "fixtures" / filename suffix = path.suffix.lower() @@ -52,6 +56,7 @@ def _loader(filename: str): rows = list(csv.DictReader(f)) return spark.createDataFrame(rows) raise ValueError(f"Unsupported fixture type for: {filename}") + return _loader From d3acf61db353c5b855b6483ea2caeb366feeb274 Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Sun, 14 Sep 2025 16:08:21 +0200 Subject: [PATCH 4/8] Apply repo formatting Though this is unfortunately inconsistent with the one from Black --- .../transformations/sample_zones_my_lakeflow_pipelines.py | 7 +------ .../template/{{.project_name}}/tests/conftest.py | 4 +--- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/transformations/sample_zones_my_lakeflow_pipelines.py b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/transformations/sample_zones_my_lakeflow_pipelines.py index 4ee08b5cd7..f033fbb279 100644 --- a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/transformations/sample_zones_my_lakeflow_pipelines.py +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/transformations/sample_zones_my_lakeflow_pipelines.py @@ -10,9 +10,4 @@ @dlt.table def sample_zones_sample(): # Read from the "sample_trips" table, then sum all the fares - return ( - spark.read.table(f"sample_trips_my_lakeflow_pipelines") - .groupBy(col("pickup_zip")) - .agg(sum("fare_amount").alias("total_fare")) - ) - + return spark.read.table(f"sample_trips_my_lakeflow_pipelines").groupBy(col("pickup_zip")).agg(sum("fare_amount").alias("total_fare")) diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/tests/conftest.py b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/tests/conftest.py index 4df274fd43..77995405da 100644 --- a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/tests/conftest.py +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/tests/conftest.py @@ -17,9 +17,7 @@ import csv import os except ImportError: - raise ImportError( - "Test dependencies not found.\n\nRun tests using 'uv run pytest'. See http://docs.astral.sh/uv to learn more about uv." - ) + raise ImportError("Test dependencies not found.\n\nRun tests using 'uv run pytest'. 
See http://docs.astral.sh/uv to learn more about uv.") @pytest.fixture() From 75bd17f00e251fdd930fc28e2dc5a5a5cfd3bd68 Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Mon, 15 Sep 2025 14:10:48 +0200 Subject: [PATCH 5/8] Remove spurious newline --- .../transformations/sample_zones_{{.project_name}}.py.tmpl | 1 - 1 file changed, 1 deletion(-) diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/transformations/sample_zones_{{.project_name}}.py.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/transformations/sample_zones_{{.project_name}}.py.tmpl index 0911a4c21d..583704de08 100644 --- a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/transformations/sample_zones_{{.project_name}}.py.tmpl +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/transformations/sample_zones_{{.project_name}}.py.tmpl @@ -15,4 +15,3 @@ def sample_zones_sample(): .groupBy(col("pickup_zip")) .agg(sum("fare_amount").alias("total_fare")) ) - From d70859233b3a35da84cada300997634caa7c5777 Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Mon, 15 Sep 2025 14:36:26 +0200 Subject: [PATCH 6/8] Formatting --- .../transformations/sample_zones_my_lakeflow_pipelines.py | 6 +++++- .../my_project/transformations/sample_zones_my_project.py | 6 +++++- .../my_project/transformations/sample_zones_my_project.py | 6 +++++- .../transformations/sample_zones_my_python_project.py | 6 +++++- .../transformations/sample_zones_{{.project_name}}.py.tmpl | 6 +++++- 5 files changed, 25 insertions(+), 5 deletions(-) diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/transformations/sample_zones_my_lakeflow_pipelines.py b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/transformations/sample_zones_my_lakeflow_pipelines.py index f033fbb279..5f631d7968 100644 --- a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/transformations/sample_zones_my_lakeflow_pipelines.py +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/transformations/sample_zones_my_lakeflow_pipelines.py @@ -10,4 +10,8 @@ @dlt.table def sample_zones_sample(): # Read from the "sample_trips" table, then sum all the fares - return spark.read.table(f"sample_trips_my_lakeflow_pipelines").groupBy(col("pickup_zip")).agg(sum("fare_amount").alias("total_fare")) + return ( + spark.read.table(f"sample_trips_my_lakeflow_pipelines") + .groupBy(col("pickup_zip")) + .agg(sum("fare_amount").alias("total_fare")) + ) diff --git a/acceptance/pipelines/e2e/output/my_project/transformations/sample_zones_my_project.py b/acceptance/pipelines/e2e/output/my_project/transformations/sample_zones_my_project.py index a28f52eef2..280b6dab89 100644 --- a/acceptance/pipelines/e2e/output/my_project/transformations/sample_zones_my_project.py +++ b/acceptance/pipelines/e2e/output/my_project/transformations/sample_zones_my_project.py @@ -10,4 +10,8 @@ @dlt.table def sample_zones_my_project(): # Read from the "sample_trips" table, then sum all the fares - return spark.read.table("sample_trips_my_project").groupBy(col("pickup_zip")).agg(sum("fare_amount").alias("total_fare")) + return ( + 
spark.read.table(f"sample_trips_my_project") + .groupBy(col("pickup_zip")) + .agg(sum("fare_amount").alias("total_fare")) + ) diff --git a/acceptance/pipelines/init/error-cases/output/my_project/transformations/sample_zones_my_project.py b/acceptance/pipelines/init/error-cases/output/my_project/transformations/sample_zones_my_project.py index a28f52eef2..280b6dab89 100644 --- a/acceptance/pipelines/init/error-cases/output/my_project/transformations/sample_zones_my_project.py +++ b/acceptance/pipelines/init/error-cases/output/my_project/transformations/sample_zones_my_project.py @@ -10,4 +10,8 @@ @dlt.table def sample_zones_my_project(): # Read from the "sample_trips" table, then sum all the fares - return spark.read.table("sample_trips_my_project").groupBy(col("pickup_zip")).agg(sum("fare_amount").alias("total_fare")) + return ( + spark.read.table(f"sample_trips_my_project") + .groupBy(col("pickup_zip")) + .agg(sum("fare_amount").alias("total_fare")) + ) diff --git a/acceptance/pipelines/init/python/output/my_python_project/transformations/sample_zones_my_python_project.py b/acceptance/pipelines/init/python/output/my_python_project/transformations/sample_zones_my_python_project.py index b1846fda32..20fcd9645e 100644 --- a/acceptance/pipelines/init/python/output/my_python_project/transformations/sample_zones_my_python_project.py +++ b/acceptance/pipelines/init/python/output/my_python_project/transformations/sample_zones_my_python_project.py @@ -10,4 +10,8 @@ @dlt.table def sample_zones_my_python_project(): # Read from the "sample_trips" table, then sum all the fares - return spark.read.table("sample_trips_my_python_project").groupBy(col("pickup_zip")).agg(sum("fare_amount").alias("total_fare")) + return ( + spark.read.table(f"sample_trips_my_python_project") + .groupBy(col("pickup_zip")) + .agg(sum("fare_amount").alias("total_fare")) + ) diff --git a/libs/template/templates/cli-pipelines/template/{{.project_name}}/transformations/sample_zones_{{.project_name}}.py.tmpl b/libs/template/templates/cli-pipelines/template/{{.project_name}}/transformations/sample_zones_{{.project_name}}.py.tmpl index 89a81121f8..df63cecd44 100644 --- a/libs/template/templates/cli-pipelines/template/{{.project_name}}/transformations/sample_zones_{{.project_name}}.py.tmpl +++ b/libs/template/templates/cli-pipelines/template/{{.project_name}}/transformations/sample_zones_{{.project_name}}.py.tmpl @@ -10,4 +10,8 @@ from pyspark.sql.functions import col, sum @dlt.table def sample_zones_{{ .project_name }}(): # Read from the "sample_trips" table, then sum all the fares - return spark.read.table("sample_trips_{{ .project_name }}").groupBy(col("pickup_zip")).agg(sum("fare_amount").alias("total_fare")) + return ( + spark.read.table(f"sample_trips_{{.project_name}}") + .groupBy(col("pickup_zip")) + .agg(sum("fare_amount").alias("total_fare")) + ) From fbb161778129d0ce1e9794a6a3d764b3e6f44229 Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Tue, 16 Sep 2025 11:32:42 +0200 Subject: [PATCH 7/8] Minor fixes --- .../transformations/sample_zones_my_lakeflow_pipelines.py | 8 ++------ .../my_project/transformations/sample_zones_my_project.py | 6 +----- .../my_project/transformations/sample_zones_my_project.py | 6 +----- .../transformations/sample_zones_my_python_project.py | 6 +----- .../sample_zones_{{.project_name}}.py.tmpl | 6 +----- .../resources/sample_job/sample_job.job.yml.tmpl | 3 +++ .../sample_zones_{{.project_name}}.py.tmpl | 8 ++------ 7 files changed, 11 insertions(+), 32 deletions(-) diff --git 
a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/transformations/sample_zones_my_lakeflow_pipelines.py b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/transformations/sample_zones_my_lakeflow_pipelines.py index 5f631d7968..9b3bde9a64 100644 --- a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/transformations/sample_zones_my_lakeflow_pipelines.py +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/transformations/sample_zones_my_lakeflow_pipelines.py @@ -8,10 +8,6 @@ @dlt.table -def sample_zones_sample(): +def sample_zones_my_lakeflow_pipelines(): # Read from the "sample_trips" table, then sum all the fares - return ( - spark.read.table(f"sample_trips_my_lakeflow_pipelines") - .groupBy(col("pickup_zip")) - .agg(sum("fare_amount").alias("total_fare")) - ) + return spark.read.table(f"sample_trips_my_lakeflow_pipelines").groupBy(col("pickup_zip")).agg(sum("fare_amount").alias("total_fare")) diff --git a/acceptance/pipelines/e2e/output/my_project/transformations/sample_zones_my_project.py b/acceptance/pipelines/e2e/output/my_project/transformations/sample_zones_my_project.py index 280b6dab89..23601f6fb9 100644 --- a/acceptance/pipelines/e2e/output/my_project/transformations/sample_zones_my_project.py +++ b/acceptance/pipelines/e2e/output/my_project/transformations/sample_zones_my_project.py @@ -10,8 +10,4 @@ @dlt.table def sample_zones_my_project(): # Read from the "sample_trips" table, then sum all the fares - return ( - spark.read.table(f"sample_trips_my_project") - .groupBy(col("pickup_zip")) - .agg(sum("fare_amount").alias("total_fare")) - ) + return spark.read.table(f"sample_trips_my_project").groupBy(col("pickup_zip")).agg(sum("fare_amount").alias("total_fare")) diff --git a/acceptance/pipelines/init/error-cases/output/my_project/transformations/sample_zones_my_project.py b/acceptance/pipelines/init/error-cases/output/my_project/transformations/sample_zones_my_project.py index 280b6dab89..23601f6fb9 100644 --- a/acceptance/pipelines/init/error-cases/output/my_project/transformations/sample_zones_my_project.py +++ b/acceptance/pipelines/init/error-cases/output/my_project/transformations/sample_zones_my_project.py @@ -10,8 +10,4 @@ @dlt.table def sample_zones_my_project(): # Read from the "sample_trips" table, then sum all the fares - return ( - spark.read.table(f"sample_trips_my_project") - .groupBy(col("pickup_zip")) - .agg(sum("fare_amount").alias("total_fare")) - ) + return spark.read.table(f"sample_trips_my_project").groupBy(col("pickup_zip")).agg(sum("fare_amount").alias("total_fare")) diff --git a/acceptance/pipelines/init/python/output/my_python_project/transformations/sample_zones_my_python_project.py b/acceptance/pipelines/init/python/output/my_python_project/transformations/sample_zones_my_python_project.py index 20fcd9645e..acc6aff4e0 100644 --- a/acceptance/pipelines/init/python/output/my_python_project/transformations/sample_zones_my_python_project.py +++ b/acceptance/pipelines/init/python/output/my_python_project/transformations/sample_zones_my_python_project.py @@ -10,8 +10,4 @@ @dlt.table def sample_zones_my_python_project(): # Read from the "sample_trips" table, then sum all the fares - return ( - spark.read.table(f"sample_trips_my_python_project") - .groupBy(col("pickup_zip")) - 
.agg(sum("fare_amount").alias("total_fare")) - ) + return spark.read.table(f"sample_trips_my_python_project").groupBy(col("pickup_zip")).agg(sum("fare_amount").alias("total_fare")) diff --git a/libs/template/templates/cli-pipelines/template/{{.project_name}}/transformations/sample_zones_{{.project_name}}.py.tmpl b/libs/template/templates/cli-pipelines/template/{{.project_name}}/transformations/sample_zones_{{.project_name}}.py.tmpl index df63cecd44..e83c6c6d70 100644 --- a/libs/template/templates/cli-pipelines/template/{{.project_name}}/transformations/sample_zones_{{.project_name}}.py.tmpl +++ b/libs/template/templates/cli-pipelines/template/{{.project_name}}/transformations/sample_zones_{{.project_name}}.py.tmpl @@ -10,8 +10,4 @@ from pyspark.sql.functions import col, sum @dlt.table def sample_zones_{{ .project_name }}(): # Read from the "sample_trips" table, then sum all the fares - return ( - spark.read.table(f"sample_trips_{{.project_name}}") - .groupBy(col("pickup_zip")) - .agg(sum("fare_amount").alias("total_fare")) - ) + return spark.read.table(f"sample_trips_{{.project_name}}").groupBy(col("pickup_zip")).agg(sum("fare_amount").alias("total_fare")) diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/sample_job/sample_job.job.yml.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/sample_job/sample_job.job.yml.tmpl index c78c1a22ac..c99ef2a5fc 100644 --- a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/sample_job/sample_job.job.yml.tmpl +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/sample_job/sample_job.job.yml.tmpl @@ -33,7 +33,10 @@ resources: notebook_task: notebook_path: sample_notebook.ipynb {{- if $serverless}} + {{- /* Environments for notebooks are still in private preview */}} + {{- if $python_package}} environment_key: default + {{- end}} {{- else}} job_cluster_key: job_cluster {{- if $python_package}} diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/transformations/sample_zones_{{.project_name}}.py.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/transformations/sample_zones_{{.project_name}}.py.tmpl index 583704de08..ec78ab5610 100644 --- a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/transformations/sample_zones_{{.project_name}}.py.tmpl +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/transformations/sample_zones_{{.project_name}}.py.tmpl @@ -8,10 +8,6 @@ from pyspark.sql.functions import col, sum @dlt.table -def sample_zones_sample(): +def sample_zones_{{.project_name}}(): # Read from the "sample_trips" table, then sum all the fares - return ( - spark.read.table(f"sample_trips_{{.project_name}}") - .groupBy(col("pickup_zip")) - .agg(sum("fare_amount").alias("total_fare")) - ) + return spark.read.table(f"sample_trips_{{.project_name}}").groupBy(col("pickup_zip")).agg(sum("fare_amount").alias("total_fare")) From e588d1f68812811572d20feeb945e0b5d147f13f Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Tue, 16 Sep 2025 13:26:39 +0200 Subject: [PATCH 8/8] Use pyspark.pipelines --- .../transformations/sample_trips_my_lakeflow_pipelines.py | 4 ++-- .../transformations/sample_zones_my_lakeflow_pipelines.py | 4 ++-- .../template/{{.project_name}}/pyproject.toml.tmpl | 
3 +++ .../resources/sample_job/sample_python_file.py.tmpl | 2 ++ .../transformations/sample_trips_{{.project_name}}.py.tmpl | 4 ++-- .../transformations/sample_zones_{{.project_name}}.py.tmpl | 4 ++-- 6 files changed, 13 insertions(+), 8 deletions(-) diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/transformations/sample_trips_my_lakeflow_pipelines.py b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/transformations/sample_trips_my_lakeflow_pipelines.py index 46868ee7ae..6bbda004ab 100644 --- a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/transformations/sample_trips_my_lakeflow_pipelines.py +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/transformations/sample_trips_my_lakeflow_pipelines.py @@ -1,4 +1,4 @@ -import dlt +from pyspark import pipelines as dp from pyspark.sql.functions import col @@ -7,6 +7,6 @@ # using "+ Add" in the file browser. -@dlt.table +@dp.table def sample_trips_my_lakeflow_pipelines(): return spark.read.table("samples.nyctaxi.trips") diff --git a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/transformations/sample_zones_my_lakeflow_pipelines.py b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/transformations/sample_zones_my_lakeflow_pipelines.py index 9b3bde9a64..b0c11bbdd7 100644 --- a/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/transformations/sample_zones_my_lakeflow_pipelines.py +++ b/acceptance/bundle/templates/lakeflow-pipelines/python/output/my_lakeflow_pipelines/resources/lakeflow_pipelines_etl/transformations/sample_zones_my_lakeflow_pipelines.py @@ -1,4 +1,4 @@ -import dlt +from pyspark import pipelines as dp from pyspark.sql.functions import col, sum @@ -7,7 +7,7 @@ # using "+ Add" in the file browser. 
-@dlt.table +@dp.table def sample_zones_my_lakeflow_pipelines(): # Read from the "sample_trips" table, then sum all the fares return spark.read.table(f"sample_trips_my_lakeflow_pipelines").groupBy(col("pickup_zip")).agg(sum("fare_amount").alias("total_fare")) diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/pyproject.toml.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/pyproject.toml.tmpl index 8733be6999..6ea4c82088 100644 --- a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/pyproject.toml.tmpl +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/pyproject.toml.tmpl @@ -30,3 +30,6 @@ build-backend = "hatchling.build" [tool.hatch.build.targets.wheel] packages = ["lib"] sources = ["lib"] + +[tool.black] +line-length = 125 diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/sample_job/sample_python_file.py.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/sample_job/sample_python_file.py.tmpl index 818837eecb..9c53aaef19 100644 --- a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/sample_job/sample_python_file.py.tmpl +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/sample_job/sample_python_file.py.tmpl @@ -2,6 +2,7 @@ import argparse from datetime import datetime from shared import taxis + def main(): parser = argparse.ArgumentParser() parser.add_argument("--catalog", default="{{.default_catalog}}") @@ -15,5 +16,6 @@ def main(): print(f"Wrote {df.count()} taxi records to {table_name}") + if __name__ == "__main__": main() diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/transformations/sample_trips_{{.project_name}}.py.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/transformations/sample_trips_{{.project_name}}.py.tmpl index e0ee8e645e..4f0fde3c2e 100644 --- a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/transformations/sample_trips_{{.project_name}}.py.tmpl +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/transformations/sample_trips_{{.project_name}}.py.tmpl @@ -1,4 +1,4 @@ -import dlt +from pyspark import pipelines as dp from pyspark.sql.functions import col @@ -7,6 +7,6 @@ from pyspark.sql.functions import col # using "+ Add" in the file browser. 
-@dlt.table +@dp.table def sample_trips_{{.project_name}}(): return spark.read.table("samples.nyctaxi.trips") diff --git a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/transformations/sample_zones_{{.project_name}}.py.tmpl b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/transformations/sample_zones_{{.project_name}}.py.tmpl index ec78ab5610..95311a27f1 100644 --- a/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/transformations/sample_zones_{{.project_name}}.py.tmpl +++ b/libs/template/templates/lakeflow-pipelines/template/{{.project_name}}/resources/{{.project_name_short}}_etl/transformations/sample_zones_{{.project_name}}.py.tmpl @@ -1,4 +1,4 @@ -import dlt +from pyspark import pipelines as dp from pyspark.sql.functions import col, sum @@ -7,7 +7,7 @@ from pyspark.sql.functions import col, sum # using "+ Add" in the file browser. -@dlt.table +@dp.table def sample_zones_{{.project_name}}(): # Read from the "sample_trips" table, then sum all the fares return spark.read.table(f"sample_trips_{{.project_name}}").groupBy(col("pickup_zip")).agg(sum("fare_amount").alias("total_fare"))