diff --git a/.github/actions/trufflehog_logs_scan/action.yml b/.github/actions/trufflehog_logs_scan/action.yml
new file mode 100644
index 0000000000..7761785136
--- /dev/null
+++ b/.github/actions/trufflehog_logs_scan/action.yml
@@ -0,0 +1,15 @@
+---
+# Composite Action to run a TruffleHog scan on task runner end-to-end test logs
+
+name: 'TruffleHog Logs Scan'
+description: 'Run TruffleHog scan on logs'
+
+runs:
+ using: 'composite'
+ steps:
+ - name: Run TruffleHog on all log files in results
+ id: trufflehog_scan
+ run: |
+ export PYTHONPATH="$PYTHONPATH:."
+ python .github/config/parse_task_runner_logs.py --log_dir ~/results
+ shell: bash
\ No newline at end of file
diff --git a/.github/config/parse_task_runner_logs.py b/.github/config/parse_task_runner_logs.py
new file mode 100644
index 0000000000..4a24111d3a
--- /dev/null
+++ b/.github/config/parse_task_runner_logs.py
@@ -0,0 +1,105 @@
+# Copyright 2020-2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+import subprocess
+import json
+import logging
+import argparse
+
+# Configure logging
+logging.basicConfig(
+ level=logging.INFO,
+ format="%(asctime)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)
+
+def get_log_files(log_dir):
+ """
+ Get all .log files in the specified directory and its subdirectories.
+ Args:
+ log_dir (str): Path to the directory containing log files.
+
+ Returns:
+ list: List of .log files found in the directory.
+ """
+ if not os.path.exists(log_dir):
+ logger.error(f"Directory '{log_dir}' does not exist.")
+ exit(1)
+
+ log_files = []
+ for root, _, files in os.walk(log_dir):
+ for file in files:
+ if file.endswith(".log"):
+ log_files.append(os.path.join(root, file))
+ return log_files
+
+
+def run_trufflehog(log_file):
+ """
+ Run TruffleHog on the specified log file and return the number of unverified secrets found.
+ Args:
+ log_file (str): Path to the log file to scan.
+ Returns:
+ int: Number of unverified secrets found in the log file.
+ """
+ try:
+ # Run TruffleHog with JSON output and capture the output
+ cmd = f'trufflehog filesystem {log_file} --no-update --json'
+ result = subprocess.run(
+ cmd, capture_output=True, shell=True, text=True, timeout=30, check=True
+ )
+ # Extract the last JSON object from the output
+ lines = result.stderr.strip().split("\n")
+ last_json = json.loads(lines[-1])
+ # Raise an error if last_json does not contain unverified_secrets
+ if "unverified_secrets" not in last_json:
+ raise json.JSONDecodeError("unverified_secrets not found in JSON output", "", 0)
+ else:
+ logger.info(f"Unverified secrets found: {last_json['unverified_secrets']}")
+ # Return the unverified_secrets count
+ return last_json.get("unverified_secrets", 0)
+ except subprocess.CalledProcessError as e:
+ logger.error(f"Error running TruffleHog on file {log_file}: {e}")
+ raise e
+ except json.JSONDecodeError as e:
+ logger.error(f"Error decoding JSON output for file {log_file}: {e}")
+ raise e
+
+
+def main(log_dir):
+ """
+ Main function to scan log files for unverified secrets.
+ Args:
+ log_dir (str): Path to the directory containing log files.
+ """
+ # Get all .log files
+ log_files = get_log_files(log_dir)
+ if not log_files:
+ logger.info("No .log files found.")
+ return
+
+ # Scan each log file with TruffleHog
+ for log_file in log_files:
+ logger.info(f"Scanning file: {log_file}")
+ unverified_secrets = run_trufflehog(log_file)
+
+ if unverified_secrets > 0:
+ logger.error(f"File '{log_file}' contains {unverified_secrets} unverified secrets.")
+ exit(1)
+
+ logger.info("All files scanned successfully. No unverified secrets found.")
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="Scan log files for unverified secrets.")
+ parser.add_argument(
+ "--log_dir",
+ type=str,
+ required=True,
+ help="Path to the directory containing log files."
+ )
+ args = parser.parse_args()
+ log_dir = os.path.expanduser(args.log_dir)
+ main(log_dir)
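For context on the gate above: `run_trufflehog` keys off the final summary line that TruffleHog logs to stderr. Below is a minimal sketch of that parsing step against a made-up summary line; the exact fields TruffleHog emits can vary by version, so treat the sample JSON as an assumption rather than the tool's documented output.

```python
import json

# Hypothetical final stderr line from a TruffleHog run (fields assumed).
summary_line = '{"level":"info","msg":"finished scanning","verified_secrets":0,"unverified_secrets":2}'

last_json = json.loads(summary_line)
if "unverified_secrets" not in last_json:
    raise ValueError("Summary line did not contain unverified_secrets")

count = last_json["unverified_secrets"]
print(f"Unverified secrets found: {count}")  # 2 -> the CI job would exit(1)
```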
diff --git a/.github/workflows/pq_pipeline.yml b/.github/workflows/pq_pipeline.yml
index f0afaf4877..9ba76e1087 100644
--- a/.github/workflows/pq_pipeline.yml
+++ b/.github/workflows/pq_pipeline.yml
@@ -19,12 +19,34 @@ concurrency:
group: ${{ github.workflow }}-${{ github.base_ref }}-${{ github.head_ref }}-${{ github.actor }}
jobs:
+ set_commit_id_for_all_jobs: # Do not change this job name, it is used by other jobs to get the commit ID
+ name: Get/Set Commit ID
+ if: github.event.pull_request.draft == false
+ runs-on: ubuntu-22.04
+ outputs:
+ commit_id: ${{ steps.set_commit_id.outputs.commit_id }}
+ steps:
+ - name: Checkout OpenFL repository
+ uses: actions/checkout@v4
+
+ - name: Set commit ID
+ id: set_commit_id
+ run: |
+ echo "commit_id=$(git rev-parse HEAD)" >> $GITHUB_OUTPUT
+
+ - name: Print commit ID to summary
+ run: |
+ echo "Commit ID used: ${{ steps.set_commit_id.outputs.commit_id }}" >> $GITHUB_STEP_SUMMARY
+
wf_mnist_local_runtime:
if: |
(github.event_name == 'schedule' && github.repository_owner == 'securefederatedai') ||
(github.event_name == 'workflow_dispatch')
name: Workflow MNIST Local Runtime
+ needs: set_commit_id_for_all_jobs
uses: ./.github/workflows/workflow_interface_101_mnist.yml
+ with:
+ commit_id: ${{ needs.set_commit_id_for_all_jobs.outputs.commit_id }}
wf_watermark_e2e:
if: |
@@ -33,6 +55,8 @@ jobs:
name: Workflow Watermarking Federated Runtime E2E
needs: wf_mnist_local_runtime
uses: ./.github/workflows/wf_watermarking_fed_runtime.yml
+ with:
+ commit_id: ${{ needs.set_commit_id_for_all_jobs.outputs.commit_id }}
wf_secagg_e2e:
if: |
@@ -41,13 +65,18 @@ jobs:
name: Workflow Secure Aggregation Federated Runtime E2E
needs: wf_watermark_e2e
uses: ./.github/workflows/wf_secagg_fed_runtime.yml
+ with:
+ commit_id: ${{ needs.set_commit_id_for_all_jobs.outputs.commit_id }}
task_runner_e2e:
if: |
(github.event_name == 'schedule' && github.repository_owner == 'securefederatedai') ||
(github.event_name == 'workflow_dispatch')
name: TaskRunner E2E
+ needs: set_commit_id_for_all_jobs
uses: ./.github/workflows/task_runner_basic_e2e.yml
+ with:
+ commit_id: ${{ needs.set_commit_id_for_all_jobs.outputs.commit_id }}
task_runner_resiliency_e2e:
if: |
@@ -56,6 +85,8 @@ jobs:
name: TaskRunner Resiliency E2E
needs: task_runner_e2e
uses: ./.github/workflows/task_runner_resiliency_e2e.yml
+ with:
+ commit_id: ${{ needs.set_commit_id_for_all_jobs.outputs.commit_id }}
task_runner_fedeval_e2e:
if: |
@@ -64,13 +95,18 @@ jobs:
name: TaskRunner FedEval E2E
needs: task_runner_e2e
uses: ./.github/workflows/task_runner_fedeval_e2e.yml
+ with:
+ commit_id: ${{ needs.set_commit_id_for_all_jobs.outputs.commit_id }}
task_runner_secure_agg_e2e:
if: |
(github.event_name == 'schedule' && github.repository_owner == 'securefederatedai') ||
(github.event_name == 'workflow_dispatch')
name: TaskRunner Secure Aggregation E2E
+ needs: set_commit_id_for_all_jobs
uses: ./.github/workflows/task_runner_secure_agg_e2e.yml
+ with:
+ commit_id: ${{ needs.set_commit_id_for_all_jobs.outputs.commit_id }}
task_runner_straggler_e2e:
if: |
@@ -79,6 +115,8 @@ jobs:
name: TaskRunner Straggler E2E
needs: task_runner_resiliency_e2e
uses: ./.github/workflows/task_runner_straggler_e2e.yml
+ with:
+ commit_id: ${{ needs.set_commit_id_for_all_jobs.outputs.commit_id }}
# run basic dockerized test with keras/mnist
task_runner_dockerized_e2e:
@@ -88,6 +126,8 @@ jobs:
name: TaskRunner Dockerized E2E
needs: task_runner_straggler_e2e
uses: ./.github/workflows/task_runner_dockerized_ws_e2e.yml
+ with:
+ commit_id: ${{ needs.set_commit_id_for_all_jobs.outputs.commit_id }}
# run testssl for task runner
task_runner_secret_ssl_e2e:
@@ -95,4 +135,18 @@ jobs:
(github.event_name == 'schedule' && github.repository_owner == 'securefederatedai') ||
(github.event_name == 'workflow_dispatch')
name: TaskRunner Secret SSL E2E
- uses: ./.github/workflows/task_runner_secret_tls_e2e.yml
\ No newline at end of file
+ needs: set_commit_id_for_all_jobs
+ uses: ./.github/workflows/task_runner_secret_tls_e2e.yml
+ with:
+ commit_id: ${{ needs.set_commit_id_for_all_jobs.outputs.commit_id }}
+
+ # run flower app with pytorch
+ task_runner_flower_app_pytorch:
+ if: |
+ (github.event_name == 'schedule' && github.repository_owner == 'securefederatedai') ||
+ (github.event_name == 'workflow_dispatch')
+ name: TaskRunner Flower App Pytorch E2E
+ needs: set_commit_id_for_all_jobs
+ uses: ./.github/workflows/task_runner_flower_e2e.yml
+ with:
+ commit_id: ${{ needs.set_commit_id_for_all_jobs.outputs.commit_id }}
diff --git a/.github/workflows/task_runner_basic_e2e.yml b/.github/workflows/task_runner_basic_e2e.yml
index 20fece2cc6..7da2f0562c 100644
--- a/.github/workflows/task_runner_basic_e2e.yml
+++ b/.github/workflows/task_runner_basic_e2e.yml
@@ -5,6 +5,10 @@ name: Task_Runner_E2E # Please do not modify the name as it is used in the comp
on:
workflow_call:
+ inputs:
+ commit_id:
+ required: false
+ type: string
workflow_dispatch:
inputs:
num_rounds:
@@ -63,6 +67,7 @@ env:
MODEL_NAME: ${{ inputs.model_name || 'all' }}
PYTHON_VERSION: ${{ inputs.python_version || 'all' }}
JOBS_TO_RUN: ${{ inputs.jobs_to_run || 'all' }}
+ COMMIT_ID: ${{ inputs.commit_id || github.sha }} # use commit_id from the calling workflow
jobs:
input_selection:
@@ -151,9 +156,7 @@ jobs:
id: checkout_openfl
uses: actions/checkout@v4
with:
- fetch-depth: 2 # needed for detecting changes
- submodules: "true"
- token: ${{ secrets.GITHUB_TOKEN }}
+ ref: ${{ env.COMMIT_ID }}
- name: Pre test run
uses: ./.github/actions/tr_pre_test_run
@@ -201,9 +204,7 @@ jobs:
id: checkout_openfl
uses: actions/checkout@v4
with:
- fetch-depth: 2 # needed for detecting changes
- submodules: "true"
- token: ${{ secrets.GITHUB_TOKEN }}
+ ref: ${{ env.COMMIT_ID }}
- name: Pre test run
uses: ./.github/actions/tr_pre_test_run
@@ -251,9 +252,7 @@ jobs:
id: checkout_openfl
uses: actions/checkout@v4
with:
- fetch-depth: 2 # needed for detecting changes
- submodules: "true"
- token: ${{ secrets.GITHUB_TOKEN }}
+ ref: ${{ env.COMMIT_ID }}
- name: Pre test run
uses: ./.github/actions/tr_pre_test_run
@@ -301,9 +300,7 @@ jobs:
id: checkout_openfl
uses: actions/checkout@v4
with:
- fetch-depth: 2 # needed for detecting changes
- submodules: "true"
- token: ${{ secrets.GITHUB_TOKEN }}
+ ref: ${{ env.COMMIT_ID }}
- name: Pre test run
uses: ./.github/actions/tr_pre_test_run
@@ -342,6 +339,8 @@ jobs:
- name: Checkout OpenFL repository
id: checkout_openfl
uses: actions/checkout@v4
+ with:
+ ref: ${{ env.COMMIT_ID }}
- name: Pre test run
uses: ./.github/actions/tr_pre_test_run
@@ -378,6 +377,8 @@ jobs:
- name: Checkout OpenFL repository
id: checkout_openfl
uses: actions/checkout@v4
+ with:
+ ref: ${{ env.COMMIT_ID }}
- name: Pre test run
uses: ./.github/actions/tr_pre_test_run
diff --git a/.github/workflows/task_runner_dockerized_ws_e2e.yml b/.github/workflows/task_runner_dockerized_ws_e2e.yml
index e2701c6426..25cfa9cc34 100644
--- a/.github/workflows/task_runner_dockerized_ws_e2e.yml
+++ b/.github/workflows/task_runner_dockerized_ws_e2e.yml
@@ -5,6 +5,10 @@ name: Task_Runner_Dockerized_E2E # Please do not modify the name as it is used
on:
workflow_call:
+ inputs:
+ commit_id:
+ required: false
+ type: string
workflow_dispatch:
inputs:
num_rounds:
@@ -37,6 +41,7 @@ env:
NUM_ROUNDS: ${{ inputs.num_rounds || '5' }}
NUM_COLLABORATORS: ${{ inputs.num_collaborators || '2' }}
JOBS_TO_RUN: ${{ inputs.jobs_to_run || 'all' }}
+ COMMIT_ID: ${{ inputs.commit_id || github.sha }} # default to current commit if not provided
jobs:
input_selection:
@@ -72,9 +77,7 @@ jobs:
id: checkout_openfl
uses: actions/checkout@v4
with:
- fetch-depth: 2 # needed for detecting changes
- submodules: "true"
- token: ${{ secrets.GITHUB_TOKEN }}
+ ref: ${{ env.COMMIT_ID }}
- name: Pre test run
uses: ./.github/actions/tr_pre_test_run
@@ -115,9 +118,7 @@ jobs:
id: checkout_openfl
uses: actions/checkout@v4
with:
- fetch-depth: 2 # needed for detecting changes
- submodules: "true"
- token: ${{ secrets.GITHUB_TOKEN }}
+ ref: ${{ env.COMMIT_ID }}
- name: Pre test run
uses: ./.github/actions/tr_pre_test_run
@@ -158,9 +159,7 @@ jobs:
id: checkout_openfl
uses: actions/checkout@v4
with:
- fetch-depth: 2 # needed for detecting changes
- submodules: "true"
- token: ${{ secrets.GITHUB_TOKEN }}
+ ref: ${{ env.COMMIT_ID }}
- name: Pre test run
uses: ./.github/actions/tr_pre_test_run
@@ -201,9 +200,7 @@ jobs:
id: checkout_openfl
uses: actions/checkout@v4
with:
- fetch-depth: 2 # needed for detecting changes
- submodules: "true"
- token: ${{ secrets.GITHUB_TOKEN }}
+ ref: ${{ env.COMMIT_ID }}
- name: Pre test run
uses: ./.github/actions/tr_pre_test_run
diff --git a/.github/workflows/task_runner_fedeval_e2e.yml b/.github/workflows/task_runner_fedeval_e2e.yml
index c922c1d810..39c39af3ae 100644
--- a/.github/workflows/task_runner_fedeval_e2e.yml
+++ b/.github/workflows/task_runner_fedeval_e2e.yml
@@ -5,6 +5,10 @@ name: Task_Runner_FedEval_E2E # Please do not modify the name as it is used in
on:
workflow_call:
+ inputs:
+ commit_id:
+ required: false
+ type: string
workflow_dispatch:
inputs:
num_rounds:
@@ -25,6 +29,7 @@ permissions:
env:
NUM_ROUNDS: ${{ inputs.num_rounds || '5' }}
NUM_COLLABORATORS: ${{ inputs.num_collaborators || '2' }}
+ COMMIT_ID: ${{ inputs.commit_id || github.sha }} # use commit_id from the calling workflow
jobs:
test_with_tls:
@@ -51,9 +56,7 @@ jobs:
id: checkout_openfl
uses: actions/checkout@v4
with:
- fetch-depth: 2 # needed for detecting changes
- submodules: "true"
- token: ${{ secrets.GITHUB_TOKEN }}
+ ref: ${{ env.COMMIT_ID }}
- name: Pre test run
uses: ./.github/actions/tr_pre_test_run
@@ -97,9 +100,7 @@ jobs:
id: checkout_openfl
uses: actions/checkout@v4
with:
- fetch-depth: 2 # needed for detecting changes
- submodules: "true"
- token: ${{ secrets.GITHUB_TOKEN }}
+ ref: ${{ env.COMMIT_ID }}
- name: Pre test run
uses: ./.github/actions/tr_pre_test_run
@@ -143,9 +144,7 @@ jobs:
id: checkout_openfl
uses: actions/checkout@v4
with:
- fetch-depth: 2 # needed for detecting changes
- submodules: "true"
- token: ${{ secrets.GITHUB_TOKEN }}
+ ref: ${{ env.COMMIT_ID }}
- name: Pre test run
uses: ./.github/actions/tr_pre_test_run
diff --git a/.github/workflows/task_runner_flower_e2e.yml b/.github/workflows/task_runner_flower_e2e.yml
new file mode 100644
index 0000000000..dce1a099f1
--- /dev/null
+++ b/.github/workflows/task_runner_flower_e2e.yml
@@ -0,0 +1,74 @@
+---
+# Task Runner Flower E2E tests for bare metal approach
+
+name: Task_Runner_Flower_E2E # Please do not modify the name as it is used in the composite action
+
+on:
+ workflow_call:
+ inputs:
+ commit_id:
+ required: false
+ type: string
+ workflow_dispatch:
+ inputs:
+ num_collaborators:
+ description: "Number of collaborators"
+ required: false
+ default: "2"
+ type: string
+ python_version:
+ description: "Python version"
+ required: false
+ default: "3.10"
+ type: choice
+ options:
+ - "3.10"
+ - "3.11"
+ - "3.12"
+
+permissions:
+ contents: read
+
+# Environment variables common for all the jobs
+# DO NOT use double quotes for the values of the environment variables
+env:
+ NUM_ROUNDS: 1
+ NUM_COLLABORATORS: ${{ inputs.num_collaborators || 2 }}
+ MODEL_NAME: "flower-app-pytorch"
+ PYTHON_VERSION: ${{ inputs.python_version || '3.10' }}
+ COMMIT_ID: ${{ inputs.commit_id || github.sha }} # use commit_id from the calling workflow
+
+jobs:
+ test_flower_app_pytorch:
+ name: With TLS (flower-app-pytorch, 3.10) # DO NOT change this name.
+ runs-on: ubuntu-22.04
+ timeout-minutes: 30
+ if: |
+ (github.event_name == 'schedule' && github.repository_owner == 'securefederatedai') ||
+ (github.event_name == 'workflow_dispatch') ||
+ (github.event.pull_request.draft == false)
+
+ steps:
+ - name: Checkout OpenFL repository
+ id: checkout_openfl
+ uses: actions/checkout@v4
+ with:
+ ref: ${{ env.COMMIT_ID }}
+
+ - name: Pre test run
+ uses: ./.github/actions/tr_pre_test_run
+ if: ${{ always() }}
+
+ - name: Run Flower App Pytorch
+ id: run_tests
+ run: |
+ python -m pytest -s tests/end_to_end/test_suites/tr_flower_tests.py \
+ -m task_runner_basic --model_name ${{ env.MODEL_NAME }} \
+ --num_rounds ${{ env.NUM_ROUNDS }} --num_collaborators ${{ env.NUM_COLLABORATORS }}
+ echo "Flower app pytorch test run completed"
+
+ - name: Post test run
+ uses: ./.github/actions/tr_post_test_run
+ if: ${{ always() }}
+ with:
+ test_type: "With_TLS_Flower"
diff --git a/.github/workflows/task_runner_resiliency_e2e.yml b/.github/workflows/task_runner_resiliency_e2e.yml
index 756cb347a6..c001e0b4e7 100644
--- a/.github/workflows/task_runner_resiliency_e2e.yml
+++ b/.github/workflows/task_runner_resiliency_e2e.yml
@@ -5,6 +5,10 @@ name: Task_Runner_Resiliency_E2E # Please do not modify the name as it is used
on:
workflow_call:
+ inputs:
+ commit_id:
+ required: false
+ type: string
workflow_dispatch:
inputs:
num_rounds:
@@ -46,6 +50,7 @@ env:
NUM_COLLABORATORS: ${{ inputs.num_collaborators || 2 }}
MODEL_NAME: ${{ inputs.model_name || 'torch/mnist' }}
PYTHON_VERSION: ${{ inputs.python_version || '3.10' }}
+ COMMIT_ID: ${{ inputs.commit_id || github.sha }} # use commit_id from the calling workflow
jobs:
input_selection:
@@ -98,9 +103,7 @@ jobs:
id: checkout_openfl
uses: actions/checkout@v4
with:
- fetch-depth: 2 # needed for detecting changes
- submodules: "true"
- token: ${{ secrets.GITHUB_TOKEN }}
+ ref: ${{ env.COMMIT_ID }}
- name: Pre test run
uses: ./.github/actions/tr_pre_test_run
diff --git a/.github/workflows/task_runner_secret_tls_e2e.yml b/.github/workflows/task_runner_secret_tls_e2e.yml
index c431ed0b10..e9c0453aa1 100644
--- a/.github/workflows/task_runner_secret_tls_e2e.yml
+++ b/.github/workflows/task_runner_secret_tls_e2e.yml
@@ -5,12 +5,33 @@ name: Task_Runner_Secret_SSL_E2E # Please do not modify the name as it is used
on:
workflow_call:
+ inputs:
+ commit_id:
+ required: false
+ type: string
workflow_dispatch:
permissions:
contents: read
+env:
+ COMMIT_ID: ${{ inputs.commit_id || github.sha }}
+
jobs:
+ trufflehog_repo_scan:
+ name: TruffleHog Repo Scan
+ runs-on: ubuntu-latest
+ steps:
+ - name: Checkout code
+ uses: actions/checkout@v4
+ with:
+ fetch-depth: 0
+ - name: Scan git repo for secrets
+ id: trufflehog
+ uses: trufflesecurity/trufflehog@main
+ with:
+ extra_args: --debug --only-verified
+
test_secret_ssl:
name: Secret SSL Check (torch/mnist, 3.10)
if: |
@@ -22,20 +43,19 @@ jobs:
env:
MODEL_NAME: 'torch/mnist'
PYTHON_VERSION: '3.10'
- NUM_ROUNDS: 10
+ NUM_ROUNDS: 8
NUM_COLLABORATORS: 2
steps:
- name: Checkout OpenFL repository
id: checkout_openfl
uses: actions/checkout@v4
with:
- fetch-depth: 2 # needed for detecting changes
- submodules: "true"
- token: ${{ secrets.GITHUB_TOKEN }}
+ ref: ${{ env.COMMIT_ID }}
- name: Install Secret SSL dependencies
run: |
sudo apt install testssl.sh
+ curl -sSfL https://raw.githubusercontent.com/trufflesecurity/trufflehog/main/scripts/install.sh | sudo sh -s -- -b /usr/local/bin
- name: Pre test run
uses: ./.github/actions/tr_pre_test_run
@@ -49,8 +69,12 @@ jobs:
--num_rounds ${{ env.NUM_ROUNDS }} --num_collaborators ${{ env.NUM_COLLABORATORS }}
echo "Task runner end to end test run completed"
+ - name: TruffleHog Logs Scan
+ uses: ./.github/actions/trufflehog_logs_scan
+ if: ${{ always() }}
+
- name: Post test run
uses: ./.github/actions/tr_post_test_run
if: ${{ always() }}
with:
- test_type: "With_Secret_SSL"
\ No newline at end of file
+ test_type: "With_Secret_SSL"
diff --git a/.github/workflows/task_runner_secure_agg_e2e.yml b/.github/workflows/task_runner_secure_agg_e2e.yml
index cfc58ff439..66a126c9e0 100644
--- a/.github/workflows/task_runner_secure_agg_e2e.yml
+++ b/.github/workflows/task_runner_secure_agg_e2e.yml
@@ -5,11 +5,18 @@ name: Task_Runner_Secure_Agg_E2E # Please do not modify the name as it is used
on:
workflow_call:
+ inputs:
+ commit_id:
+ required: false
+ type: string
workflow_dispatch:
permissions:
contents: read
+env:
+ COMMIT_ID: ${{ inputs.commit_id || github.sha }} # use commit_id from the calling workflow
+
jobs:
test_secure_aggregation:
name: Secure Aggregation (torch/mnist, 3.10)
@@ -29,9 +36,7 @@ jobs:
id: checkout_openfl
uses: actions/checkout@v4
with:
- fetch-depth: 2 # needed for detecting changes
- submodules: "true"
- token: ${{ secrets.GITHUB_TOKEN }}
+ ref: ${{ env.COMMIT_ID }}
- name: Install Secure Aggregation dependencies
run: |
@@ -53,4 +58,4 @@ jobs:
uses: ./.github/actions/tr_post_test_run
if: ${{ always() }}
with:
- test_type: "With_Secure_Agg"
\ No newline at end of file
+ test_type: "With_Secure_Agg"
diff --git a/.github/workflows/task_runner_straggler_e2e.yml b/.github/workflows/task_runner_straggler_e2e.yml
index 39932df31c..90ab99728a 100644
--- a/.github/workflows/task_runner_straggler_e2e.yml
+++ b/.github/workflows/task_runner_straggler_e2e.yml
@@ -5,11 +5,18 @@ name: Task_Runner_Straggler_E2E # Please do not modify the name as it is used i
on:
workflow_call:
+ inputs:
+ commit_id:
+ required: false
+ type: string
workflow_dispatch:
permissions:
contents: read
+env:
+ COMMIT_ID: ${{ inputs.commit_id || github.sha }} # use commit_id from the calling workflow
+
jobs:
test_straggler_percentage_policy:
name: Percentage Policy (torch/mnist_straggler_check, 3.10)
@@ -26,9 +33,7 @@ jobs:
id: checkout_openfl
uses: actions/checkout@v4
with:
- fetch-depth: 2 # needed for detecting changes
- submodules: "true"
- token: ${{ secrets.GITHUB_TOKEN }}
+ ref: ${{ env.COMMIT_ID }}
- name: Pre test run
uses: ./.github/actions/tr_pre_test_run
@@ -61,9 +66,7 @@ jobs:
id: checkout_openfl
uses: actions/checkout@v4
with:
- fetch-depth: 2 # needed for detecting changes
- submodules: "true"
- token: ${{ secrets.GITHUB_TOKEN }}
+ ref: ${{ env.COMMIT_ID }}
- name: Pre test run
uses: ./.github/actions/tr_pre_test_run
diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml
index 3c8c901581..d0c74f1826 100644
--- a/.github/workflows/trivy.yml
+++ b/.github/workflows/trivy.yml
@@ -53,7 +53,7 @@ jobs:
- name: Run Trivy vulnerability scanner for Docker image (JSON Output)
id: trivy-scan
- uses: aquasecurity/trivy-action@0.29.0
+ uses: aquasecurity/trivy-action@0.30.0
with:
image-ref: 'docker.io/securefederatedai/openfl:${{ github.sha }}'
format: 'json'
@@ -97,7 +97,7 @@ jobs:
path: trivy-code-spdx-results.json
- name: Run Trivy vulnerability scanner for Docker image (SPDX-JSON Output)
- uses: aquasecurity/trivy-action@0.29.0
+ uses: aquasecurity/trivy-action@0.30.0
with:
image-ref: 'docker.io/securefederatedai/openfl:${{ github.sha }}'
format: 'spdx-json'
diff --git a/.github/workflows/wf_secagg_fed_runtime.yml b/.github/workflows/wf_secagg_fed_runtime.yml
index d0c23d69a2..91a6e65361 100644
--- a/.github/workflows/wf_secagg_fed_runtime.yml
+++ b/.github/workflows/wf_secagg_fed_runtime.yml
@@ -4,12 +4,19 @@
name: Secure Aggregation Runtime E2E
on:
- workflow_dispatch:
workflow_call:
+ inputs:
+ commit_id:
+ required: false
+ type: string
+ workflow_dispatch:
permissions:
contents: read
+env:
+ COMMIT_ID: ${{ inputs.commit_id || github.sha }} # use commit_id from the calling workflow
+
jobs:
test_secure_agg_notebook:
name: WF SecAgg Without TLS
@@ -23,9 +30,7 @@ jobs:
- name: Checkout OpenFL repository
uses: actions/checkout@v4
with:
- fetch-depth: 2 # needed for detecting changes
- submodules: "true"
- token: ${{ secrets.GITHUB_TOKEN }}
+ ref: ${{ env.COMMIT_ID }}
- name: Set up Python
uses: actions/setup-python@v5
diff --git a/.github/workflows/wf_watermarking_fed_runtime.yml b/.github/workflows/wf_watermarking_fed_runtime.yml
index a3f0314c43..613e2e2cfe 100644
--- a/.github/workflows/wf_watermarking_fed_runtime.yml
+++ b/.github/workflows/wf_watermarking_fed_runtime.yml
@@ -5,11 +5,18 @@ name: Federated Runtime Watermarking E2E
on:
workflow_call:
+ inputs:
+ commit_id:
+ required: false
+ type: string
workflow_dispatch:
permissions:
contents: read
+env:
+ COMMIT_ID: ${{ inputs.commit_id || github.sha }} # use commit_id from the calling workflow
+
jobs:
test_federated_runtime_301_watermarking_notebook:
name: WF Watermarking Without TLS
@@ -23,9 +30,7 @@ jobs:
- name: Checkout OpenFL repository
uses: actions/checkout@v4
with:
- fetch-depth: 2 # needed for detecting changes
- submodules: "true"
- token: ${{ secrets.GITHUB_TOKEN }}
+ ref: ${{ env.COMMIT_ID }} # use commit_id from the calling workflow
- name: Set up Python
uses: actions/setup-python@v5
diff --git a/.github/workflows/workflow_interface_101_mnist.yml b/.github/workflows/workflow_interface_101_mnist.yml
index af2da742ab..fa425f92f3 100644
--- a/.github/workflows/workflow_interface_101_mnist.yml
+++ b/.github/workflows/workflow_interface_101_mnist.yml
@@ -5,11 +5,18 @@ name: Workflow MNIST Local Runtime E2E
on:
workflow_call:
+ inputs:
+ commit_id:
+ required: false
+ type: string
workflow_dispatch:
permissions:
contents: read
+env:
+ COMMIT_ID: ${{ inputs.commit_id || github.sha }} # use commit_id from the calling workflow
+
jobs:
wf_local_101_mnist:
name: WF Local Without TLS
@@ -20,9 +27,7 @@ jobs:
- name: Checkout OpenFL repository
uses: actions/checkout@v4
with:
- fetch-depth: 2 # needed for detecting changes
- submodules: "true"
- token: ${{ secrets.GITHUB_TOKEN }}
+ ref: ${{ env.COMMIT_ID }} # use commit_id from the calling workflow
- name: Set up Python
uses: actions/setup-python@v5
diff --git a/README.md b/README.md
index 8031180916..b2c621c44a 100644
--- a/README.md
+++ b/README.md
@@ -5,6 +5,7 @@
[](https://pypi.org/project/openfl/)
+[](https://anaconda.org/conda-forge/openfl)
[](https://pepy.tech/project/openfl)
[](https://openfl.readthedocs.io/en/latest/?badge=latest)
[

](https://join.slack.com/t/openfl/shared_invite/zt-ovzbohvn-T5fApk05~YS_iZhjJ5yaTw)
@@ -47,6 +48,10 @@ Install via PyPI (latest stable release):
```
pip install -U openfl
```
+Or via conda:
+```
+conda install conda-forge::openfl
+```
For more installation options, checkout the [installation guide](https://openfl.readthedocs.io/en/latest/installation.html).
## Features
@@ -96,4 +101,4 @@ This project is licensed under [Apache License Version 2.0](LICENSE). By contrib
doi={10.1088/1361-6560/ac97d9},
publisher={IOP Publishing}
}
-```
+```
\ No newline at end of file
diff --git a/docs/about/blogs_publications.md b/docs/about/blogs_publications.md
index 548c9df0aa..0ed9bb16bd 100644
--- a/docs/about/blogs_publications.md
+++ b/docs/about/blogs_publications.md
@@ -3,6 +3,11 @@
### 2025
* [Federated Discrete Denoising Diffusion Model for Molecular Generation with OpenFL](https://arxiv.org/abs/2501.12523)
* [Confidential Federated Learning with OpenFL - Intel Blog](https://community.intel.com/t5/Blogs/Tech-Innovation/Artificial-Intelligence-AI/Confidential-Federated-Learning-with-OpenFL/post/1658148)
+* [Collaborative evaluation for performance assessment of medical imaging applications](https://www.sciencedirect.com/science/article/abs/pii/B9780443237614000195)
+
+### 2024
+* [Pan-Cancer Tumor Infiltrating Lymphocyte Detection based on Federated Learning](https://ieeexplore.ieee.org/abstract/document/10825083)
+* [Privacy preservation for federated learning in health care](https://www.cell.com/patterns/fulltext/S2666-3899(24)00082-5)
### 2022
* [Federated learning enables big data for rare cancer boundary detection, Dec 2022](https://www.nature.com/articles/s41467-022-33407-5)
@@ -10,9 +15,11 @@
* [OpenFL: the open federated learning library, Oct 2022](https://iopscience.iop.org/article/10.1088/1361-6560/ac97d9/pdf)
* [Federated Learning With OpenFL for Microservices Applications, Aug 2022](https://blogs.vmware.com/opensource/2022/08/31/federated-learning-with-openfl-for-microservices-applications-2/)
* [A Path Towards Secure Federated Learning, Apr 2022](https://medium.com/openfl/a-path-towards-secure-federated-learning-c2fb16d5e66e)
+* [MammoFL: Mammographic Breast Density Estimation using Federated Learning](https://arxiv.org/abs/2206.05575)
+* [The Federated Tumor Segmentation (FeTS) tool: an open-source solution to further solid tumor research](https://iopscience.iop.org/article/10.1088/1361-6560/ac9449/meta)
### 2021
* [Go Federated with OpenFL: Put your Deep Learning pipeline on Federated rails, Oct 2021](https://towardsdatascience.com/go-federated-with-openfl-8bc145a5ead1)
### 2020
-* [Federated learning in medicine: facilitating multi-institutional collaborations without sharing patient data, Jul 2020](https://www.nature.com/articles/s41598-020-69250-1)
+* [Federated learning in medicine: facilitating multi-institutional collaborations without sharing patient data, Jul 2020](https://www.nature.com/articles/s41598-020-69250-1)
\ No newline at end of file
diff --git a/docs/releases.md b/docs/releases.md
index bcd0f19a39..4bbe4469d3 100644
--- a/docs/releases.md
+++ b/docs/releases.md
@@ -1,4 +1,20 @@
# Releases
+## 1.8
+[Full Release Notes](https://github.com/securefederatedai/openfl/releases/tag/v1.8)
+
+### New Features
+- [**Secure Aggregation**](https://openfl.readthedocs.io/en/latest/about/features_index/secure_aggregation.html): A privacy-preserving aggregation algorithm for Federated Learning, based on secure multiparty computation (MPC) that helps protect intermediate model updates from introspection. OpenFL now supports Secure Aggregation via both Task Runner API and Workflow API.
+
+- [**OpenFL/Flower Interop**](https://github.com/securefederatedai/openfl/tree/develop/openfl-workspace/flower-app-pytorch): Showcasing how FL experiments defined using the Flower API can be run as OpenFL federations via TaskRunner API. This enables combining Flower's extensive library of FL algorithms with OpenFL's advanced security features, including trusted execution, secure communication, and explicit safeguards against data exfiltration.
+
+- [**Federated Evaluation**](https://openfl.readthedocs.io/en/latest/about/features_index/fed_eval.html): Enabling seamless switching from learning to evaluation in Task Runner API without redistributing the FL plan, which is particularly advantageous in large, geo-distributed federations. Furthermore, Federated Evaluation is now available via the [Workflow API](https://github.com/securefederatedai/openfl/blob/develop/openfl-tutorials/experimental/workflow/405_MNIST_FederatedEvaluation.ipynb).
+
+### Enhanced Developer Experience
+- **Removing Legacy APIs**: The Python Native API and the Interactive API have been removed, along with the accompanying examples and documentation.
+
+- **ML Frameworks Integration**: Upgraded PyTorch-based FL workspaces to PyTorch 2.4.1 and provided additional [Keras 3 back-ends](https://github.com/securefederatedai/openfl/tree/develop/openfl-workspace/keras) (incl. JAX and PyTorch).
+
+- **Enhanced Resilience**: OpenFL Task Runner API experiments can now recover from Collaborator and Aggregator restarts.
## 1.7
[Full Release Notes](https://github.com/securefederatedai/openfl/releases/tag/v1.7)
diff --git a/docs/tutorials/taskrunner.ipynb b/docs/tutorials/taskrunner.ipynb
index fa7dd37bee..1b0678f300 100644
--- a/docs/tutorials/taskrunner.ipynb
+++ b/docs/tutorials/taskrunner.ipynb
@@ -26,8 +26,8 @@
"metadata": {},
"source": [
" collaborators:\n",
- " - collaborator1\n",
- " - collaborator2"
+ " - bob\n",
+ " - charlie"
]
},
{
diff --git a/linters-requirements.txt b/linters-requirements.txt
index 65b3ba5ebf..3df354dfda 100644
--- a/linters-requirements.txt
+++ b/linters-requirements.txt
@@ -1,2 +1,2 @@
pre-commit
-ruff==0.9.9
\ No newline at end of file
+ruff==0.11.2
\ No newline at end of file
diff --git a/openfl-workspace/flower-app-pytorch/README.md b/openfl-workspace/flower-app-pytorch/README.md
index fa108772ef..2b286d1eee 100644
--- a/openfl-workspace/flower-app-pytorch/README.md
+++ b/openfl-workspace/flower-app-pytorch/README.md
@@ -4,7 +4,7 @@ This workspace demonstrates a new functionality in OpenFL to interoperate with [
## Overview
-In this repository, you'll notice a directory under `src` called `app-pytorch`. This is essentially a Flower PyTorch app created using Flower's `flwr new` command that has been modified to run a local federation. The `client_app.py` and `server_app.py` dictate what will be run by the client and server respectively. `task.py` defines the logic that will be executed by each app, such as the model definition, train/test tasks, etc. Under `server_app.py` a section titled "Save Model" is added in order to save the `best.pbuf` and `last.pbuf` models from the experiment in your local workspace under `./save`. This uses native OpenFL logic to store the model as a `.pbuf` in order to later be retrieved by `fx model save` into a native format (limited to `.npz` to be deep learning framework agnostic), but this can be overridden to save the model directly following Flower's recommended method for [saving model checkpoints](https://flower.ai/docs/framework/how-to-save-and-load-model-checkpoints.html).
+In this repository, you'll notice a directory under `src` called `app-pytorch`. This is essentially a Flower PyTorch app created using Flower's `flwr new` command that has been modified to run a local federation. The `client_app.py` and `server_app.py` dictate what will be run by the client and server, respectively. `task.py` defines the logic that will be executed by each app, such as the model definition, train/test tasks, etc. In `server_app.py`, a save-model strategy is defined in order to save the best and last models from the experiment in your local workspace under `./save`.
## Getting Started
@@ -19,21 +19,58 @@ Start by creating a workspace:
```sh
fx workspace create --template flower-app-pytorch --prefix my_workspace
cd my_workspace
+pip install -r requirements.txt
```
-This will create a workspace in your current working directory called `./my_workspace` as well as install the Flower app defined in `./app-pytorch.` This will be where the experiment takes place.
+Then create a certificate authority (CA):
+
+```sh
+fx workspace certify
+```
+
+This will create a workspace in your current working directory called `./my_workspace` as well as install the Flower app defined in `./app-pytorch`. This is where the experiment will take place. The CA will be used to sign the collaborators' certificates.
+
+### Setup Data
+We will be using the CIFAR-10 dataset. You can download and automatically partition it into 2 shards using the provided `./src/setup_data.py` script.
+
+```sh
+python ./src/setup_data.py 2
+```
+
+This will download the data, partition it into 2 shards, and store them under `./data/1` and `./data/2`, respectively.
+
+```
+data/
+├── 1
+│ ├── test
+│ │ ├── 0
+│ │ ├── 1
+│ │ ├── 2
+│ │ ├── 3
+│ │ ├── 4
+│ │ ├── 5
+│ │ ├── 6
+│ │ ├── 7
+│ │ ├── 8
+│ │ └── 9
+│ └── train
+│ ├── ...
+└── 2
+ ├── ...
+```
### Configure the Experiment
Notice under `./plan`, you will find the familiar OpenFL YAML files to configure the experiment. `cols.yaml` and `data.yaml` will be populated by the collaborators that will run the Flower client app and the respective data shard or directory they will perform their training and testing on.
`plan.yaml` configures the experiment itself. The Open-Flower integration makes a few key changes to the `plan.yaml`:
-1. Introduction of a new top-level key (`connector`) to configure a newly introduced component called `ConnectorFlower`. This component is run by the aggregator and is responsible for initializing the Flower `SuperLink` and connecting to the OpenFL server. The `SuperLink` parameters can be configured using `connector.settings.superlink_params`. If nothing is supplied, it will simply run `flower-superlink --insecure` with the command's default settings as dictated by Flower. It also includes the option to run the flwr run command via `connector.settings.flwr_run_params`. If `flwr_run_params` are not provided, the user will be expected to run `flwr run` from the aggregator machine to initiate the experiment.
+1. Introduction of a new top-level key (`connector`) to configure a newly introduced component called `ConnectorFlower`. This component is run by the aggregator and is responsible for initializing the Flower `SuperLink` and connecting to the OpenFL server. The `SuperLink` parameters can be configured using `connector.settings.superlink_params`. If nothing is supplied, it will simply run `flower-superlink --insecure` with the command's default settings as dictated by Flower. It also includes the option to run the `flwr run` command via `connector.settings.flwr_run_params`. If `flwr_run_params` are not provided, the user will be expected to run `flwr run` from the aggregator machine to initiate the experiment. Additionally, `ConnectorFlower` has a setting, `connector.settings.automatic_shutdown`, which defaults to `True`. When set to `True`, the task runner will shut down the SuperNode at the completion of an experiment; otherwise, it will run continuously.
```yaml
connector:
defaults: plan/defaults/connector.yaml
template: openfl.component.ConnectorFlower
settings:
+ automatic_shutdown: True
superlink_params:
insecure: True
serverappio-api-address: 127.0.0.1:9091
@@ -44,20 +81,16 @@ connector:
federation_name: "local-poc"
```
-2. `FlowerTaskRunner` which will execute the `start_client_adapter` task. This task starts the Flower SuperNode and makes a connection to the OpenFL client. Additionally, the `FlowerTaskRunner` has an additional setting `FlowerTaskRunner.settings.auto_shutdown` which is default set to `True`. When set to `True`, the task runner will shut the SuperNode at the completion of an experiment, otherwise, it will run continuously.
+2. `FlowerTaskRunner` which will execute the `start_client_adapter` task. This task starts the Flower SuperNode and makes a connection to the OpenFL client.
```yaml
task_runner:
defaults: plan/defaults/task_runner.yaml
template: openfl.federated.task.runner_flower.FlowerTaskRunner
- settings:
- auto_shutdown: True
```
3. `FlowerDataLoader` with similar high-level functionality to other dataloaders.
-**IMPORTANT NOTE**: `aggregator.settings.rounds_to_train` is set to 1. __Do not edit this__. The actual number of rounds for the experiment is controlled by Flower logic inside of `./app-pytorch/pyproject.toml`. The entirety of the Flower experiment will run in a single OpenFL round. Increasing this will cause OpenFL to attempt to run the experiment again. The aggregator round is there to stop the OpenFL components at the completion of the experiment.
-
4. `Task` - we introduce a `tasks_connector.yaml` that will allow the collaborator to connect to Flower framework via the local gRPC server. It also handles the task runner's `start_client_adapter` method, which actually starts the Flower component and local gRPC server. By setting `local_server_port` to 0, the port is dynamically allocated. This is mainly for local experiments to avoid overlapping the ports.
```yaml
@@ -70,6 +103,12 @@ tasks:
local_server_port: 0
```
+> **Note**: `aggregator.settings.rounds_to_train` is set to 1. __Do not edit this__. The actual number of rounds for the experiment is controlled by Flower logic inside of `./app-pytorch/pyproject.toml`. The entirety of the Flower experiment will run in a single OpenFL round. Increasing this will cause OpenFL to attempt to run the experiment again. The aggregator round is there to stop the OpenFL components at the completion of the experiment.
+
+> **Note**: `aggregator.settings.write_logs` will be set to `False`. While setting it to `True` will not result in an error, OpenFL's aggregator will not capture the logs since logging is handled by Flower directly.
+
+> **Note**: This workspace does not currently support secure aggregation through OpenFL natively. Refer to Flower's documentation to enable secure aggregation.
+
## Execution Methods
There are two ways to execute this:
@@ -78,17 +117,21 @@ There are two ways to execute this:
## Running the Workspace
We proceed with the automatic shutdown method of execution.
-Run the workspace as normal (certify the workspace, initialize the plan, register the collaborators, etc.):
+
+Initialize the plan:
+
+```SH
+fx plan initialize -a localhost
+```
+
+Run the workspace as normal (aggregator setup, collaborator setup, etc.):
```SH
# Generate a Certificate Signing Request (CSR) for the Aggregator
-fx aggregator generate-cert-request
+fx aggregator generate-cert-request --fqdn localhost
# The CA signs the aggregator's request, which is now available in the workspace
-fx aggregator certify --silent
-
-# Initialize FL Plan and Model Weights for the Federation
-fx plan initialize
+fx aggregator certify --fqdn localhost --silent
################################
# Setup Collaborator 1
@@ -115,12 +158,11 @@ fx collaborator generate-cert-request -n collaborator2
# The CA signs collaborator2's certificate
fx collaborator certify -n collaborator2 --silent
+```
-##############################
-# Start to Run the Federation
-##############################
+Start the aggregator:
-# Run the Aggregator
+```SH
fx aggregator start
```
@@ -241,13 +283,14 @@ It will run another experiment. Once you are done, you can manually shut down Op
### Running in SGX Enclave
Gramine does not support all Linux system calls. Flower FAB is built and installed at runtime. During this, `utime()` is called, which is an [unsupported call](https://gramine.readthedocs.io/en/latest/devel/features.html#list-of-system-calls), resulting in error or unexpected behavior. To navigate this, when running in an SGX enclave, we opt to build and install the FAB during initialization and package it alongside the OpenFL workspace. To make this work, we introduce some patches to Flower's build command, which helps circumvent the unsupported system call as well as minimize read/write access.
-To run these patches, simply add `patch: True` to the `Connector` and `Task Runner` settings. For the `Task Runner` also include the name of the Flower app for building and installation.
+To run these patches, simply add `patch: True` to the `Connector` and `Task Runner` settings (if not already set). For the `Task Runner`, also include the name of the Flower app for building and installation.
```yaml
connector :
defaults : plan/defaults/connector.yaml
template : openfl.component.ConnectorFlower
settings :
+ automatic_shutdown : True
superlink_params :
insecure : True
serverappio-api-address : 127.0.0.1:9091
diff --git a/openfl-workspace/flower-app-pytorch/plan/plan.yaml b/openfl-workspace/flower-app-pytorch/plan/plan.yaml
index 1cb55eca51..b7ad7167e0 100644
--- a/openfl-workspace/flower-app-pytorch/plan/plan.yaml
+++ b/openfl-workspace/flower-app-pytorch/plan/plan.yaml
@@ -13,6 +13,7 @@ connector :
defaults : plan/defaults/connector.yaml
template : src.connector_flower.ConnectorFlower
settings :
+ automatic_shutdown : True
superlink_params :
insecure : True
serverappio-api-address : 127.0.0.1:9091
diff --git a/openfl-workspace/flower-app-pytorch/src/app-pytorch/pyproject.toml b/openfl-workspace/flower-app-pytorch/src/app-pytorch/pyproject.toml
index 39e88d12a1..dcf1febcd8 100644
--- a/openfl-workspace/flower-app-pytorch/src/app-pytorch/pyproject.toml
+++ b/openfl-workspace/flower-app-pytorch/src/app-pytorch/pyproject.toml
@@ -8,7 +8,7 @@ version = "1.0.0"
description = ""
license = "Apache-2.0"
dependencies = [
- "flwr>=1.15.0",
+ "flwr>=1.15.0,<1.17.0",
"flwr-datasets[vision]>=0.5.0",
"torch==2.5.1",
"torchvision==0.20.1",
diff --git a/openfl-workspace/flower-app-pytorch/src/connector_flower.py b/openfl-workspace/flower-app-pytorch/src/connector_flower.py
index 42e1377be7..bfba10258f 100644
--- a/openfl-workspace/flower-app-pytorch/src/connector_flower.py
+++ b/openfl-workspace/flower-app-pytorch/src/connector_flower.py
@@ -7,9 +7,15 @@
import signal
from src.grpc.connector.flower.interop_client import FlowerInteropClient
+from src.util import is_safe_path
import os
-os.environ["FLWR_HOME"] = os.path.join(os.getcwd(), "save/.flwr")
+
+flwr_home = os.path.join(os.getcwd(), "save/.flwr")
+if not is_safe_path(flwr_home):
+ raise ValueError("Invalid path for FLWR_HOME")
+
+os.environ["FLWR_HOME"] = flwr_home
os.makedirs(os.environ["FLWR_HOME"], exist_ok=True)
class ConnectorFlower:
diff --git a/openfl-workspace/flower-app-pytorch/src/loader.py b/openfl-workspace/flower-app-pytorch/src/loader.py
index e741aa44e9..0b63f60af0 100644
--- a/openfl-workspace/flower-app-pytorch/src/loader.py
+++ b/openfl-workspace/flower-app-pytorch/src/loader.py
@@ -4,6 +4,7 @@
"""FlowerDataLoader module."""
from openfl.federated.data.loader import DataLoader
+import os
class FlowerDataLoader(DataLoader):
@@ -21,8 +22,14 @@ def __init__(self, data_path, **kwargs):
Args:
data_path (str or int): The directory of the dataset.
**kwargs: Additional keyword arguments to pass to the parent DataLoader class.
+
+ Raises:
+ FileNotFoundError: If the specified data path does not exist.
"""
super().__init__(**kwargs)
+ if not os.path.exists(data_path):
+ raise FileNotFoundError(f"The specified data path does not exist: {data_path}")
+
self.data_path = data_path
def get_node_configs(self):
diff --git a/openfl-workspace/flower-app-pytorch/src/patch/patch_flwr_build.py b/openfl-workspace/flower-app-pytorch/src/patch/patch_flwr_build.py
index b42ce44d6a..d8411cbd4d 100644
--- a/openfl-workspace/flower-app-pytorch/src/patch/patch_flwr_build.py
+++ b/openfl-workspace/flower-app-pytorch/src/patch/patch_flwr_build.py
@@ -12,6 +12,7 @@
import tomli_w
import hashlib
import os
+from src.util import is_safe_path
def build(
app: Annotated[
@@ -131,6 +132,12 @@ def build(
### PATCH ###
# # REASONING: original code writes to /tmp/ by default. Writing to flwr_home allows us to consolidate written files
+ if not os.path.isdir(flwr_home):
+ raise ValueError("Invalid directory")
+
+ if not is_safe_path(fab_filename):
+ raise ValueError("Invalid filename")
+
final_path = os.path.join(flwr_home, fab_filename)
shutil.move(temp_filename, final_path)
#################################
@@ -144,5 +151,4 @@ def build(
return final_path, fab_hash
################
-
flwr.cli.build.build = build
diff --git a/openfl-workspace/flower-app-pytorch/src/runner.py b/openfl-workspace/flower-app-pytorch/src/runner.py
index 3b80f6cb43..a456bbe942 100644
--- a/openfl-workspace/flower-app-pytorch/src/runner.py
+++ b/openfl-workspace/flower-app-pytorch/src/runner.py
@@ -7,8 +7,13 @@
from pathlib import Path
import sys
import socket
+from src.util import is_safe_path
-os.environ["FLWR_HOME"] = os.path.join(os.getcwd(), "save/.flwr")
+flwr_home = os.path.join(os.getcwd(), "save/.flwr")
+if not is_safe_path(flwr_home):
+ raise ValueError("Invalid path for FLWR_HOME")
+
+os.environ["FLWR_HOME"] = flwr_home
os.makedirs(os.environ["FLWR_HOME"], exist_ok=True)
class FlowerTaskRunner(TaskRunner):
@@ -177,7 +182,7 @@ def get_dynamic_port():
# Create a socket
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
# Bind to port 0 to let the OS assign an available port
- s.bind(('', 0))
+ s.bind(('127.0.0.1', 0))
# Get the assigned port number
port = s.getsockname()[1]
return port
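The `get_dynamic_port` change above narrows the bind from all interfaces to loopback only. The underlying pattern, binding to port 0 so the OS assigns a free port, can be exercised on its own; this is a sketch mirroring the patched helper, not an OpenFL API.

```python
import socket

def get_free_loopback_port() -> int:
    # Bind to port 0 on loopback; the OS picks an available port,
    # which we read back before the socket closes.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("127.0.0.1", 0))
        return s.getsockname()[1]

print(get_free_loopback_port())  # e.g. 49231; varies per run
```

Note the small race inherent to this pattern: the port is released when the socket closes, so another process could claim it before the local gRPC server binds it. For single-host test runs this is generally acceptable.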
diff --git a/openfl-workspace/flower-app-pytorch/src/util.py b/openfl-workspace/flower-app-pytorch/src/util.py
new file mode 100644
index 0000000000..750eff8f2a
--- /dev/null
+++ b/openfl-workspace/flower-app-pytorch/src/util.py
@@ -0,0 +1,13 @@
+import re
+
+def is_safe_path(path):
+ """
+ Validate the path to ensure it contains only allowed characters.
+
+ Args:
+ path (str): The path to validate.
+
+ Returns:
+ bool: True if the path is safe, False otherwise.
+ """
+ return re.match(r'^[\w\-/\.]+$', path) is not None
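To make the allowlist concrete, here is a quick check of what `is_safe_path` accepts and rejects: the regex permits word characters, hyphens, slashes, and dots, so spaces and shell metacharacters fail, while relative `..` segments still pass.

```python
import re

def is_safe_path(path):
    # Allowlist: letters, digits, underscore, hyphen, slash, dot.
    return re.match(r'^[\w\-/\.]+$', path) is not None

print(is_safe_path("/home/user/save/.flwr"))  # True
print(is_safe_path("save/flwr home"))         # False (space not allowed)
print(is_safe_path("save;rm -rf /"))          # False (';' and space)
print(is_safe_path("../outside"))             # True ('..' is not rejected)
```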
diff --git a/openfl-workspace/keras/hippmapp3r/.workspace b/openfl-workspace/keras/hippmapp3r/.workspace
new file mode 100644
index 0000000000..3c2c5d08b4
--- /dev/null
+++ b/openfl-workspace/keras/hippmapp3r/.workspace
@@ -0,0 +1,2 @@
+current_plan_name: default
+
diff --git a/openfl-workspace/keras/hippmapp3r/README.md b/openfl-workspace/keras/hippmapp3r/README.md
new file mode 100644
index 0000000000..26ef206136
--- /dev/null
+++ b/openfl-workspace/keras/hippmapp3r/README.md
@@ -0,0 +1,14 @@
+# Implementation of HippMapp3r for Medical Imaging
+
+This project implements the HippMapp3r hippocampal segmentation model for medical imaging. It works similarly to other workspaces, as described in the [OpenFL Taskrunner Tutorial](https://openfl.readthedocs.io/en/latest/tutorials/taskrunner.html).
+
+## Setup Instructions
+
+To set up the data, run the following command:
+```bash
+python src/setup_data.py --num_collaborators $MAX_NUMBER_OF_COLLABORATORS --total_dataset_size_per_col_MB $DESIRED_DATASET_SIZE
+```
+
+## Reference
+
+Goubran, M., Ntiri E., Akhavein, H., Holmes, M., Nestor, S., Ramirez, J., Adamo, S., Gao, F., Ozzoude, M., Scott, C., Martel, A., Swardfager, W., Masellis, M., Swartz, R., MacIntosh B, and Black, SE. “Hippocampal segmentation for atrophied brains using three-dimensional convolutional neural networks”. Human Brain Mapping. 2020 Feb 1;41(2):291-308. https://onlinelibrary.wiley.com/doi/full/10.1002/hbm.24811
\ No newline at end of file
diff --git a/openfl-workspace/keras/hippmapp3r/plan/cols.yaml b/openfl-workspace/keras/hippmapp3r/plan/cols.yaml
new file mode 100644
index 0000000000..285a69c6ff
--- /dev/null
+++ b/openfl-workspace/keras/hippmapp3r/plan/cols.yaml
@@ -0,0 +1,4 @@
+# Copyright (C) 2020-2025 Intel Corporation
+# Licensed subject to the terms of the separately executed evaluation license agreement between Intel Corporation and you.
+
+collaborators:
diff --git a/openfl-workspace/keras/hippmapp3r/plan/data.yaml b/openfl-workspace/keras/hippmapp3r/plan/data.yaml
new file mode 100644
index 0000000000..4dc0b4920d
--- /dev/null
+++ b/openfl-workspace/keras/hippmapp3r/plan/data.yaml
@@ -0,0 +1,6 @@
+# Copyright (C) 2020-2025 Intel Corporation
+# Licensed subject to the terms of the separately executed evaluation license agreement between Intel Corporation and you.
+
+# collaborator_name,data_directory_path
+collaborator1,data/1
+collaborator2,data/2
\ No newline at end of file
diff --git a/openfl-workspace/keras/hippmapp3r/plan/defaults b/openfl-workspace/keras/hippmapp3r/plan/defaults
new file mode 100644
index 0000000000..fb82f9c5b6
--- /dev/null
+++ b/openfl-workspace/keras/hippmapp3r/plan/defaults
@@ -0,0 +1,2 @@
+../../workspace/plan/defaults
+
diff --git a/openfl-workspace/keras/hippmapp3r/plan/plan.yaml b/openfl-workspace/keras/hippmapp3r/plan/plan.yaml
new file mode 100644
index 0000000000..0d9acda7be
--- /dev/null
+++ b/openfl-workspace/keras/hippmapp3r/plan/plan.yaml
@@ -0,0 +1,65 @@
+# Copyright (C) 2020-2025 Intel Corporation
+# Licensed subject to the terms of the separately executed evaluation license agreement between Intel Corporation and you.
+
+aggregator :
+ defaults : plan/defaults/aggregator.yaml
+ template : openfl.component.Aggregator
+ settings :
+ init_state_path : save/init.pbuf
+ best_state_path : save/best.pbuf
+ last_state_path : save/last.pbuf
+ rounds_to_train : 10
+
+collaborator :
+ defaults : plan/defaults/collaborator.yaml
+ template : openfl.component.Collaborator
+ settings :
+ use_delta_updates : false
+ opt_treatment : RESET
+
+data_loader :
+ defaults : plan/defaults/data_loader.yaml
+ template : src.dataloader.KerasHippmapp3rsynth
+ settings :
+ batch_size : 1
+
+task_runner :
+ defaults : plan/defaults/task_runner.yaml
+ template : src.taskrunner.KerasHippmapp3r
+
+network :
+ defaults : plan/defaults/network.yaml
+ settings :
+ agg_port : 54678
+
+assigner :
+ defaults : plan/defaults/assigner.yaml
+
+tasks :
+ defaults : plan/defaults/tasks_keras.yaml
+ aggregated_model_validation:
+ function : validate_task
+ kwargs :
+ batch_size : 1
+ apply : global
+ metrics :
+ - dice_coefficient
+
+ locally_tuned_model_validation:
+ function : validate_task
+ kwargs :
+ batch_size : 1
+ apply : local
+ metrics :
+ - dice_coefficient
+
+ train:
+ function : train_task
+ kwargs :
+ batch_size : 1
+ epochs : 1
+ metrics :
+ - loss
+
+compression_pipeline :
+ defaults : plan/defaults/compression_pipeline.yaml
diff --git a/openfl-workspace/keras/hippmapp3r/requirements.txt b/openfl-workspace/keras/hippmapp3r/requirements.txt
new file mode 100644
index 0000000000..1970de95d4
--- /dev/null
+++ b/openfl-workspace/keras/hippmapp3r/requirements.txt
@@ -0,0 +1,5 @@
+keras==2.15.0
+tensorflow
+keras-contrib @ git+https://www.github.com/keras-team/keras-contrib.git
+gdown
+scikit-learn
\ No newline at end of file
diff --git a/openfl-workspace/keras/hippmapp3r/src/__init__.py b/openfl-workspace/keras/hippmapp3r/src/__init__.py
new file mode 100644
index 0000000000..035ee4d0ae
--- /dev/null
+++ b/openfl-workspace/keras/hippmapp3r/src/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (C) 2020-2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+"""You may copy this file as the starting point of your own model."""
diff --git a/openfl-workspace/keras/hippmapp3r/src/dataloader.py b/openfl-workspace/keras/hippmapp3r/src/dataloader.py
new file mode 100644
index 0000000000..7e8664ebc2
--- /dev/null
+++ b/openfl-workspace/keras/hippmapp3r/src/dataloader.py
@@ -0,0 +1,59 @@
+# Copyright (C) 2020-2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+"""You may copy this file as the starting point of your own model."""
+
+from openfl.federated import KerasDataLoader
+import numpy as np
+from sklearn.model_selection import train_test_split
+from glob import glob
+
+
+class KerasHippmapp3rsynth(KerasDataLoader):
+ """Data Loader for synthetic Hippmapp3r Dataset."""
+
+ def __init__(self, data_path, batch_size, **kwargs):
+ """
+ Initialize.
+
+ Args:
+ data_path: File path for the dataset
+ batch_size (int): The batch size for the data loader
+ **kwargs: Additional arguments, passed to the parent class init
+ """
+ super().__init__(batch_size, **kwargs)
+
+ X_train = glob(f"{data_path}/X*.npy")
+ y_train = glob(f"{data_path}/y*.npy")
+ # Perform test-train split
+ X_train, X_valid, y_train, y_valid = train_test_split(
+ X_train, y_train, test_size=0.2, random_state=42
+ )
+ self.X_train = np.asarray(X_train)
+ self.X_valid = np.asarray(X_valid)
+ self.y_train = np.asarray(y_train)
+ self.y_valid = np.asarray(y_valid)
+
+ @staticmethod
+ def _batch_generator(X, y, idxs, batch_size, num_batches):
+ """Generates batches of data.
+
+ Args:
+ X (np.array): The input data.
+ y (np.array): The label data.
+ idxs (np.array): The index of the dataset.
+ batch_size (int): The batch size for the data loader.
+ num_batches (int): The number of batches.
+
+ Yields:
+ tuple: The input data and label data for each batch.
+ """
+ for i in range(num_batches):
+ a = i * batch_size
+ b = a + batch_size
+ x_list = []
+ y_list = []
+ for _x, _y in zip(X[idxs[a:b]], y[idxs[a:b]]):
+ x_list.append(np.load(_x))
+ y_list.append(np.load(_y))
+ yield np.stack(x_list), np.stack(y_list)
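A sketch of how the generator above is consumed: `X` and `y` hold paths to `.npy` files, and each batch loads from disk only the files in the current index slice. The snippet inlines the generator logic with tiny stand-in arrays (the real workspace uses 160x160x128 volumes), so it runs without the OpenFL imports.

```python
import numpy as np

def batch_generator(X, y, idxs, batch_size, num_batches):
    # Mirrors KerasHippmapp3rsynth._batch_generator: each batch loads
    # only the .npy files whose paths fall in the current index slice.
    for i in range(num_batches):
        a, b = i * batch_size, (i + 1) * batch_size
        xs = [np.load(p) for p in X[idxs[a:b]]]
        ys = [np.load(p) for p in y[idxs[a:b]]]
        yield np.stack(xs), np.stack(ys)

# Tiny stand-in samples on disk.
for name in ("X_0", "X_1", "y_0", "y_1"):
    np.save(f"{name}.npy", np.random.random((2, 2)))

X = np.asarray(["X_0.npy", "X_1.npy"])
y = np.asarray(["y_0.npy", "y_1.npy"])
for xb, yb in batch_generator(X, y, np.arange(2), batch_size=1, num_batches=2):
    print(xb.shape, yb.shape)  # (1, 2, 2) (1, 2, 2), twice
```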
diff --git a/openfl-workspace/keras/hippmapp3r/src/setup_data.py b/openfl-workspace/keras/hippmapp3r/src/setup_data.py
new file mode 100644
index 0000000000..c160b3c2c5
--- /dev/null
+++ b/openfl-workspace/keras/hippmapp3r/src/setup_data.py
@@ -0,0 +1,47 @@
+# Copyright (C) 2020-2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+"""You may copy this file as the starting point of your own model."""
+import os
+import numpy as np
+from tqdm import tqdm
+import argparse
+
+
+def create_data(collaborators, dst, total_dataset_size_per_col):
+ num_samples = 1
+ input_shape = (160, 160, 128)
+ X_train = np.random.random((num_samples,) + input_shape)
+ y_train = np.random.random((num_samples,) + input_shape)
+
+ # Save the arrays
+ os.makedirs(dst + "/1", exist_ok=True)
+ np.save(dst + "/1/X_train.npy", X_train)
+ np.save(dst + "/1/y_train.npy", y_train)
+ single_file_size_MB = os.path.getsize(dst + "/1/X_train.npy") / 1024 / 1024
+ single_file_size_MB += os.path.getsize(dst + "/1/y_train.npy") / 1024 / 1024
+ required_file_size_MB = total_dataset_size_per_col * collaborators
+ files_to_make = required_file_size_MB // single_file_size_MB
+ files_per_collaborator = int(files_to_make // collaborators)
+ for col in range(1, collaborators + 1):
+ print("making files for collaborator", col)
+ os.makedirs(dst + f"/{col}", exist_ok=True)
+ for i in tqdm(range(files_per_collaborator)):
+ X_train = np.random.random((num_samples,) + input_shape)
+ y_train = np.random.random((num_samples,) + input_shape)
+ np.save(dst + f"/{col}/X_train_{i}.npy", X_train)
+ np.save(dst + f"/{col}/y_train_{i}.npy", y_train)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--num_collaborators", type=int, help="Number of collaborators")
+ parser.add_argument(
+ "--total_dataset_size_per_col_MB", type=int, help="Total dataset size per collaborator"
+ )
+ args = parser.parse_args()
+
+ num_collaborators = args.num_collaborators
+ total_dataset_size_per_col = args.total_dataset_size_per_col_MB
+
+ dst = "data"
+ create_data(num_collaborators, dst, total_dataset_size_per_col)
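The sizing logic in `create_data` works backwards from one sample pair: a (1, 160, 160, 128) float64 array is 160 * 160 * 128 * 8 bytes = 25 MB, so an X/y pair measures about 50 MB on disk. A worked check of the resulting file counts, assuming hypothetical inputs of 2 collaborators and 500 MB per collaborator:

```python
# Worked example of the sizing arithmetic in create_data(),
# with hypothetical inputs (2 collaborators, 500 MB each).
single_pair_MB = 2 * (160 * 160 * 128 * 8) / 1024 / 1024   # X + y, float64 -> 50.0
collaborators = 2
total_dataset_size_per_col_MB = 500

required_MB = total_dataset_size_per_col_MB * collaborators  # 1000
files_to_make = required_MB // single_pair_MB                # 20.0
files_per_collaborator = int(files_to_make // collaborators) # 10
print(single_pair_MB, files_to_make, files_per_collaborator)
```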
diff --git a/openfl-workspace/keras/hippmapp3r/src/taskrunner.py b/openfl-workspace/keras/hippmapp3r/src/taskrunner.py
new file mode 100644
index 0000000000..1fe60e0797
--- /dev/null
+++ b/openfl-workspace/keras/hippmapp3r/src/taskrunner.py
@@ -0,0 +1,87 @@
+# Copyright (C) 2020-2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+"""You may copy this file as the starting point of your own model."""
+
+from keras.models import model_from_json
+from keras_contrib.layers import InstanceNormalization
+
+from openfl.federated import KerasTaskRunner
+from keras import backend as K
+import gdown
+import hashlib
+
+MODEL_JSON_HASH = "c35cfa990000ad87825f182460395ec5d1437a707bd33de4df55d45664e94214"
+MODEL_WEIGHTS_HASH = "cd5e52d42e2c6d737e370fb0e673aec5d257134e127c0e59478f11676fa327a5"
+
+
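+# Soft Dice coefficient over flattened tensors, smoothed to avoid division by zero:
+# dice = (2 * sum(y_true * y_pred) + smooth) / (sum(y_true) + sum(y_pred) + smooth)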
+def dice_coefficient(y_true, y_pred, smooth=1.0):
+ y_true_f = K.flatten(y_true)
+ y_pred_f = K.flatten(y_pred)
+ intersection = K.sum(y_true_f * y_pred_f)
+ return (2.0 * intersection + smooth) / (K.sum(y_true_f) + K.sum(y_pred_f) + smooth)
+
+
+def dice_coefficient_loss(y_true, y_pred):
+ return -dice_coefficient(y_true, y_pred)
+
+
+def compute_file_hash(file_path):
+ with open(file_path, "rb") as file:
+ data = file.read()
+ file_hash = hashlib.sha256(data).hexdigest()
+ return file_hash
+
+
+class KerasHippmapp3r(KerasTaskRunner):
+ """A basic convolutional neural network model."""
+
+ def __init__(self, **kwargs):
+ """
+ Initialize.
+
+ Args:
+ **kwargs: Additional parameters to pass to the function
+ """
+ super().__init__(**kwargs)
+
+ weights_id = "1_VEOScLGyr1qV-t-zggq8Lxwgf_z-IpQ"
+ gdown.download(id=weights_id, output="model.h5")
+ json_id = "1RUE3Cw_rpKnKfwlu75kLbkcr9hde9nV4"
+ gdown.download(id=json_id, output="model.json")
+
+ model_json_hash = compute_file_hash("model.json")
+ if model_json_hash != MODEL_JSON_HASH:
+ raise ValueError("Model JSON file hash does not match expected value.")
+
+ model_weights_hash = compute_file_hash("model.h5")
+ if model_weights_hash != MODEL_WEIGHTS_HASH:
+ raise ValueError("Model weights file hash does not match expected value.")
+
+ self.model = self.build_model(model_json="model.json", model_weights="model.h5", **kwargs)
+
+ self.initialize_tensorkeys_for_functions()
+
+ def build_model(self, model_json, model_weights, **kwargs):
+ """
+ Define the model architecture.
+
+ Args:
+ model_json (str): Path to model json config
+ model_weights (str): Path to model weights
+
+ Returns:
+ keras.models.Model: The compiled model loaded from the JSON config and weights
+
+ """
+ custom_objects = {"InstanceNormalization": InstanceNormalization}
+ with open(model_json, "r") as json_file:
+ loaded_model_json = json_file.read()
+ model = model_from_json(loaded_model_json, custom_objects=custom_objects)
+ model.load_weights(model_weights)
+
+ model.compile(loss=dice_coefficient_loss, optimizer="adam", metrics=[dice_coefficient])
+
+ return model
diff --git a/openfl-workspace/torch/mnist/plan/cols.yaml b/openfl-workspace/torch/mnist/plan/cols.yaml
index b60b50e5a8..2f4993ebbb 100644
--- a/openfl-workspace/torch/mnist/plan/cols.yaml
+++ b/openfl-workspace/torch/mnist/plan/cols.yaml
@@ -2,5 +2,3 @@
# Licensed subject to the terms of the separately executed evaluation license agreement between Intel Corporation and you.
collaborators:
-- collaborator1
-- collaborator2
\ No newline at end of file
diff --git a/openfl/__version__.py b/openfl/__version__.py
index 362d032f0d..9caed07178 100644
--- a/openfl/__version__.py
+++ b/openfl/__version__.py
@@ -4,4 +4,4 @@
"""openfl version information."""
-__version__ = "1.8.0.dev0"
+__version__ = "1.9.0.dev"
diff --git a/openfl/callbacks/metric_writer.py b/openfl/callbacks/metric_writer.py
index fc9a2daa35..c5a7cdb8d3 100644
--- a/openfl/callbacks/metric_writer.py
+++ b/openfl/callbacks/metric_writer.py
@@ -3,6 +3,7 @@
import json
import logging
import os
+import time
from tensorboardX import SummaryWriter
@@ -26,6 +27,7 @@ def __init__(self, log_dir: str = "./logs/", use_tensorboard: bool = True):
self._log_file_handle = None
self._summary_writer = None
+ self._round_start_time = None
def on_experiment_begin(self, logs=None):
"""Open file handles for logging."""
@@ -49,13 +51,19 @@ def on_round_end(self, round_num: int, logs=None):
logs: A key-value pair of scalar metrics.
"""
logs = logs or {}
- logger.info(f"Round {round_num}: Metrics: {logs}")
-
- self._log_file_handle.write(json.dumps(logs) + "\n")
+ elapsed_seconds = time.monotonic() - self._round_start_time
+ metrics = {
+ "round_number": round_num,
+ "elapsed_seconds": elapsed_seconds,
+ **logs,
+ }
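+ # One JSON line is appended to the metrics log per round, e.g. (illustrative):
+ # {"round_number": 3, "elapsed_seconds": 12.4, "col1/train/loss": 0.21}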
+ logger.info(f"Round {round_num}: Metrics: {metrics}")
+
+ self._log_file_handle.write(json.dumps(metrics) + "\n")
self._log_file_handle.flush()
if self._summary_writer:
- for key, value in logs.items():
+ for key, value in metrics.items():
self._summary_writer.add_scalar(key, value, round_num)
self._summary_writer.flush()
@@ -67,3 +75,6 @@ def on_experiment_end(self, logs=None):
if self._summary_writer:
self._summary_writer.close()
+
+ def on_round_begin(self, round_num: int, logs=None):
+ """Record the round start time used to compute elapsed_seconds."""
+ self._round_start_time = time.monotonic()
\ No newline at end of file
diff --git a/openfl/component/aggregator/aggregator.py b/openfl/component/aggregator/aggregator.py
index b2e9676184..547909fb80 100644
--- a/openfl/component/aggregator/aggregator.py
+++ b/openfl/component/aggregator/aggregator.py
@@ -17,7 +17,7 @@
from openfl.pipelines import NoCompressionPipeline, TensorCodec
from openfl.protocols import base_pb2, utils
from openfl.protocols.base_pb2 import NamedTensor
-from openfl.utilities import TaskResultKey, TensorKey, change_tags
+from openfl.utilities import TaskResultKey, TensorKey, apply_delta, change_tags, generate_delta
logger = logging.getLogger(__name__)
@@ -168,9 +168,6 @@ def __init__(
self.best_state_path = best_state_path
self.last_state_path = last_state_path
- # TODO: Remove. Used in deprecated interactive and native APIs
- self.best_tensor_dict: dict = {}
- self.last_tensor_dict: dict = {}
# these enable getting all tensors for a task
self.collaborator_tasks_results = {} # {TaskResultKey: list of TensorKeys}
self.collaborator_task_weight = {} # {TaskResultKey: data_size}
@@ -192,8 +189,6 @@ def __init__(
origin="aggregator",
)
- self.collaborator_tensor_results = {} # {TensorKey: nparray}}
-
if initial_tensor_dict:
self._load_initial_tensors_from_dict(initial_tensor_dict)
self.model = utils.construct_model_proto(
@@ -549,7 +544,6 @@ def _straggler_cutoff_time_elapsed(self) -> None:
def get_aggregated_tensor(
self,
- collaborator_name,
tensor_name,
round_number,
report,
@@ -563,7 +557,6 @@ def get_aggregated_tensor(
that matches the request.
Args:
- collaborator_name (str): Requested tensor key collaborator name.
tensor_name (str): Name of the tensor.
round_number (int): Actual round number.
report (bool): Whether to report.
@@ -576,11 +569,6 @@ def get_aggregated_tensor(
Raises:
ValueError: if Aggregator does not have an aggregated tensor for {tensor_key}.
"""
- logger.debug(
- f"Retrieving aggregated tensor {tensor_name},{round_number},{tags} "
- f"for collaborator {collaborator_name}"
- )
-
if "compressed" in tags or require_lossless:
compress_lossless = True
else:
@@ -672,9 +660,7 @@ def default(self, obj):
"The original model layer should be present if the latest "
"aggregated model is present"
)
- delta_tensor_key, delta_nparray = self.tensor_codec.generate_delta(
- tensor_key, nparray, model_nparray
- )
+ delta_tensor_key, delta_nparray = generate_delta(tensor_key, nparray, model_nparray)
delta_comp_tensor_key, delta_comp_nparray, metadata = self.tensor_codec.compress(
delta_tensor_key, delta_nparray, lossless=compress_lossless
)
@@ -763,6 +749,7 @@ def send_local_task_results(
f"Collaborator {collaborator_name} is sending task results "
f"for {task_name}, round {round_number}"
)
+
self.process_task_results(
collaborator_name, round_number, task_name, data_size, named_tensors
)
@@ -830,10 +817,9 @@ def process_task_results(
self.collaborator_tasks_results[task_key] = task_results
- with self.lock:
- self._is_collaborator_done(collaborator_name, round_number)
-
- self._end_of_round_with_stragglers_check()
+ # Check if collaborator or round is done.
+ self._is_collaborator_done(collaborator_name, round_number)
+ self._end_of_round_with_stragglers_check()
def _end_of_round_with_stragglers_check(self):
"""
@@ -931,7 +917,7 @@ def _process_named_tensor(self, named_tensor, collaborator_name):
base_model_nparray = self.tensor_db.get_tensor_from_cache(base_model_tensor_key)
if base_model_nparray is None:
raise ValueError(f"Base model {base_model_tensor_key} not present in TensorDB")
- final_tensor_key, final_nparray = self.tensor_codec.apply_delta(
+ final_tensor_key, final_nparray = apply_delta(
decompressed_tensor_key,
decompressed_nparray,
base_model_nparray,
@@ -971,9 +957,7 @@ def _prepare_trained(self, tensor_name, origin, round_number, report, agg_result
base_model_tk = TensorKey(tensor_name, origin, round_number, report, ("model",))
base_model_nparray = self.tensor_db.get_tensor_from_cache(base_model_tk)
if base_model_nparray is not None and self.use_delta_updates:
- delta_tk, delta_nparray = self.tensor_codec.generate_delta(
- agg_tag_tk, agg_results, base_model_nparray
- )
+ delta_tk, delta_nparray = generate_delta(agg_tag_tk, agg_results, base_model_nparray)
else:
# This condition is possible for base model
# optimizer states (i.e. Adam/iter:0, SGD, etc.)
@@ -1001,7 +985,7 @@ def _prepare_trained(self, tensor_name, origin, round_number, report, agg_result
# Apply delta (unless delta couldn't be created)
if base_model_nparray is not None and self.use_delta_updates:
logger.debug("Applying delta for layer %s", decompressed_delta_tk[0])
- new_model_tk, new_model_nparray = self.tensor_codec.apply_delta(
+ new_model_tk, new_model_nparray = apply_delta(
decompressed_delta_tk,
decompressed_delta_nparray,
base_model_nparray,
@@ -1162,6 +1146,8 @@ def _end_of_round_check(self):
self.stragglers = []
# resetting collaborators_done for next round
self.collaborators_done = []
+ self.collaborator_tasks_results = {}
+ self.collaborator_task_weight = {}
# TODO This needs to be fixed!
if self._time_to_quit():
diff --git a/openfl/component/assigner/static_grouped_assigner.py b/openfl/component/assigner/static_grouped_assigner.py
index 810e0ce352..c0e0d1ffa9 100644
--- a/openfl/component/assigner/static_grouped_assigner.py
+++ b/openfl/component/assigner/static_grouped_assigner.py
@@ -83,7 +83,7 @@ def define_task_assignments(self):
for col in group_col_list:
# For now, we assume that collaborators have the same tasks for
# every round
- self.collaborator_tasks[col] = {i: group["tasks"] for i in range(self.rounds)}
+ self.collaborator_tasks[col] = dict.fromkeys(range(self.rounds), group["tasks"])
# Now populate reverse lookup of tasks->group
for task in group["tasks"]:
for round_ in range(self.rounds):
diff --git a/openfl/component/collaborator/collaborator.py b/openfl/component/collaborator/collaborator.py
index dd3e237eb6..06788a1555 100644
--- a/openfl/component/collaborator/collaborator.py
+++ b/openfl/component/collaborator/collaborator.py
@@ -11,10 +11,9 @@
import openfl.callbacks as callbacks_module
from openfl.databases import TensorDB
-from openfl.pipelines import NoCompressionPipeline, TensorCodec
-from openfl.protocols import utils
from openfl.transport.grpc.aggregator_client import AggregatorGRPCClient
-from openfl.utilities import TensorKey
+from openfl.transport.serialiser.collaborator import CollaboratorSerialiser
+from openfl.utilities import TensorKey, apply_delta, generate_delta
logger = logging.getLogger(__name__)
@@ -69,12 +68,12 @@ def __init__(
opt_treatment="RESET",
device_assignment_policy="CPU_ONLY",
use_delta_updates=False,
- compression_pipeline=None,
db_store_rounds=1,
log_memory_usage=False,
write_logs=False,
callbacks: Optional[List] = [],
secure_aggregation=False,
+ serialisation_middleware: CollaboratorSerialiser = None,
):
"""Initialize the Collaborator object.
@@ -105,8 +104,10 @@ def __init__(
self.aggregator_uuid = aggregator_uuid
self.federation_uuid = federation_uuid
- self.compression_pipeline = compression_pipeline or NoCompressionPipeline()
- self.tensor_codec = TensorCodec(self.compression_pipeline)
+ # This serialisation middleware is used to convert tensors to protobuf and vice versa when
+ # communicating with the aggregator.
+ self._serialisation_middleware = serialisation_middleware
+
self.tensor_db = TensorDB()
self.db_store_rounds = db_store_rounds
@@ -159,7 +160,7 @@ def run(self):
self.callbacks.on_experiment_begin()
while True:
- tasks, round_num, sleep_time, time_to_quit = self.client.get_tasks()
+ tasks, round_num, sleep_time, time_to_quit = self._serialisation_middleware.get_tasks()
if time_to_quit:
break
@@ -300,9 +301,7 @@ def get_data_for_tensorkey(self, tensor_key):
# Determine whether there are additional compression related
# dependencies.
# Typically, dependencies are only relevant to model layers
- tensor_dependencies = self.tensor_codec.find_dependencies(
- tensor_key, self.use_delta_updates
- )
+ tensor_dependencies = self._find_dependencies(tensor_key)
logger.debug(
"Unable to get tensor from local store..."
"attempting to retrieve from client len tensor_dependencies"
@@ -319,7 +318,7 @@ def get_data_for_tensorkey(self, tensor_key):
uncompressed_delta = self.get_aggregated_tensor_from_aggregator(
tensor_dependencies[1]
)
- new_model_tk, nparray = self.tensor_codec.apply_delta(
+ new_model_tk, nparray = apply_delta(
tensor_dependencies[1],
uncompressed_delta,
prior_model_layer,
@@ -379,18 +378,13 @@ def get_aggregated_tensor_from_aggregator(self, tensor_key, require_lossless=Fal
tensor_name, origin, round_number, report, tags = tensor_key
logger.debug("Requesting aggregated tensor %s", tensor_key)
- tensor = self.client.get_aggregated_tensor(
+ tensor_key, nparray = self._serialisation_middleware.get_aggregated_tensor(
tensor_name,
round_number,
report,
tags,
require_lossless,
)
-
- # this translates to a numpy array and includes decompression, as
- # necessary
- nparray = self.named_tensor_to_nparray(tensor)
-
# cache this tensor
self.tensor_db.cache_tensor({tensor_key: nparray})
@@ -407,8 +401,6 @@ def send_task_results(self, tensor_dict, round_number, task_name) -> dict:
Returns:
A dictionary of reportable metrics of the current collaborator for the task.
"""
- named_tensors = [self.nparray_to_named_tensor(k, v) for k, v in tensor_dict.items()]
-
# for general tasks, there may be no notion of data size to send.
# But that raises the question how to properly aggregate results.
@@ -423,128 +415,43 @@ def send_task_results(self, tensor_dict, round_number, task_name) -> dict:
logger.debug("%s data size = %s", task_name, data_size)
metrics = {}
+ tensor_dict_copy = {}
for tensor in tensor_dict:
tensor_name, origin, fl_round, report, tags = tensor
+ if "trained" in tags and self.use_delta_updates:
+ # Fetch the pretrained model to create the delta. If training has
+ # happened, the model is already stored in the TensorDB.
+ model_nparray = self.tensor_db.get_tensor_from_cache(
+ TensorKey(tensor_name, origin, fl_round, report, ("model",))
+ )
+
+ # The original model will not be present for the optimizer on the
+ # first round.
+ if model_nparray is not None:
+ tensor, nparray = generate_delta(tensor, tensor_dict[tensor], model_nparray)
+ # The second element of the value indicates whether a lossless
+ # transformation is required; deltas tolerate lossy compression.
+ tensor_dict_copy[tensor] = (nparray, False)
if report:
# Reportable metric must be a scalar
value = float(tensor_dict[tensor])
metrics.update({f"{self.collaborator_name}/{task_name}/{tensor_name}": value})
- self.client.send_local_task_results(
+ # For all other elements, we assume a lossless transformation is required.
+ if tensor not in tensor_dict_copy:
+ tensor_dict_copy[tensor] = (tensor_dict[tensor], True)
+
+ self._serialisation_middleware.send_local_task_results(
round_number,
task_name,
data_size,
- named_tensors,
+ tensor_dict_copy,
)
return metrics
- def nparray_to_named_tensor(self, tensor_key, nparray):
- """Construct the NamedTensor Protobuf.
-
- Includes logic to create delta, compress tensors with the TensorCodec,
- etc.
-
- Args:
- tensor_key (namedtuple): Tensorkey that will be resolved locally or
- remotely. May be the product of other tensors.
- nparray: The decompressed tensor associated with the requested
- tensor key.
-
- Returns:
- named_tensor (protobuf) : The tensor constructed from the nparray.
- """
- # if we have an aggregated tensor, we can make a delta
- tensor_name, origin, round_number, report, tags = tensor_key
- if "trained" in tags and self.use_delta_updates:
- # Should get the pretrained model to create the delta. If training
- # has happened,
- # Model should already be stored in the TensorDB
- model_nparray = self.tensor_db.get_tensor_from_cache(
- TensorKey(tensor_name, origin, round_number, report, ("model",))
- )
-
- # The original model will not be present for the optimizer on the
- # first round.
- if model_nparray is not None:
- delta_tensor_key, delta_nparray = self.tensor_codec.generate_delta(
- tensor_key, nparray, model_nparray
- )
- delta_comp_tensor_key, delta_comp_nparray, metadata = self.tensor_codec.compress(
- delta_tensor_key, delta_nparray
- )
-
- named_tensor = utils.construct_named_tensor(
- delta_comp_tensor_key,
- delta_comp_nparray,
- metadata,
- lossless=False,
- )
- return named_tensor
-
- # Assume every other tensor requires lossless compression
- compressed_tensor_key, compressed_nparray, metadata = self.tensor_codec.compress(
- tensor_key, nparray, require_lossless=True
- )
- named_tensor = utils.construct_named_tensor(
- compressed_tensor_key, compressed_nparray, metadata, lossless=True
- )
-
- return named_tensor
-
- def named_tensor_to_nparray(self, named_tensor):
- """Convert named tensor to a numpy array.
-
- Args:
- named_tensor (protobuf): The tensor to convert to nparray.
-
- Returns:
- decompressed_nparray (nparray): The nparray converted.
- """
- # do the stuff we do now for decompression and frombuffer and stuff
- # This should probably be moved back to protoutils
- raw_bytes = named_tensor.data_bytes
- metadata = [
- {
- "int_to_float": proto.int_to_float,
- "int_list": proto.int_list,
- "bool_list": proto.bool_list,
- }
- for proto in named_tensor.transformer_metadata
- ]
- # The tensor has already been transferred to collaborator, so
- # the newly constructed tensor should have the collaborator origin
- tensor_key = TensorKey(
- named_tensor.name,
- self.collaborator_name,
- named_tensor.round_number,
- named_tensor.report,
- tuple(named_tensor.tags),
- )
- *_, tags = tensor_key
- if "compressed" in tags:
- decompressed_tensor_key, decompressed_nparray = self.tensor_codec.decompress(
- tensor_key,
- data=raw_bytes,
- transformer_metadata=metadata,
- require_lossless=True,
- )
- elif "lossy_compressed" in tags:
- decompressed_tensor_key, decompressed_nparray = self.tensor_codec.decompress(
- tensor_key, data=raw_bytes, transformer_metadata=metadata
- )
- else:
- # There could be a case where the compression pipeline is bypassed
- # entirely
- logger.warning("Bypassing tensor codec...")
- decompressed_tensor_key = tensor_key
- decompressed_nparray = raw_bytes
-
- self.tensor_db.cache_tensor({decompressed_tensor_key: decompressed_nparray})
-
- return decompressed_nparray
-
def _apply_masks(
self,
tensor_dict,
@@ -577,3 +484,27 @@ def _apply_masks(
continue
masked_metric = np.add(self._private_mask, tensor_dict[tensor_key])
tensor_dict[tensor_key] = np.add(masked_metric, self._shared_mask)
+
+ def _find_dependencies(self, tensor_key):
+ """Resolve the tensors required to do the specified operation.
+
+ Args:
+ tensor_key: A tuple containing the tensor name, origin, round
+ number, report, and tags.
+
+ Returns:
+ tensor_key_dependencies: A list of tensor keys that are
+ dependencies of the given tensor key.
+ """
+ tensor_key_dependencies = []
+
+ tensor_name, origin, round_number, report, tags = tensor_key
+
+ if "model" in tags and self.use_delta_updates:
+ if round_number >= 1:
+ # The new model can be generated by previous model + delta
+ tensor_key_dependencies.append(
+ TensorKey(tensor_name, origin, round_number - 1, report, tags)
+ )
+
+ return tensor_key_dependencies
diff --git a/openfl/databases/tensor_db.py b/openfl/databases/tensor_db.py
index da78110df5..fc4b197212 100644
--- a/openfl/databases/tensor_db.py
+++ b/openfl/databases/tensor_db.py
@@ -149,7 +149,7 @@ def get_tensor_from_cache(self, tensor_key: TensorKey) -> Optional[np.ndarray]:
if len(df) == 0:
return None
- return np.array(df["nparray"].iloc[0])
+ return np.asarray(df["nparray"].iloc[0])
def get_tensors_by_round_and_tags(self, fl_round: int, tags: tuple) -> dict:
"""Retrieve all tensors that match the specified round and tags.
@@ -231,7 +231,7 @@ def get_aggregated_tensor(
& (self.tensor_db["tags"] == tags)
]["nparray"]
if len(raw_df) > 0:
- return np.array(raw_df.iloc[0]), {}
+ return np.asarray(raw_df.iloc[0]), {}
for col in collaborator_names:
new_tags = change_tags(tags, add_field=col)
@@ -277,7 +277,7 @@ def get_aggregated_tensor(
agg_nparray = aggregation_function(local_tensors, db_iterator, tensor_name, fl_round, tags)
self.cache_tensor({tensor_key: agg_nparray})
- return np.array(agg_nparray)
+ return np.asarray(agg_nparray)
def _iterate(self, order_by: str = "round", ascending: bool = False) -> Iterator[pd.Series]:
"""Returns an iterator over the rows of the TensorDB, sorted by a
diff --git a/openfl/federated/plan/plan.py b/openfl/federated/plan/plan.py
index a324934e76..5398ded85b 100644
--- a/openfl/federated/plan/plan.py
+++ b/openfl/federated/plan/plan.py
@@ -15,7 +15,9 @@
from openfl.interface.aggregation_functions import AggregationFunction, WeightedAverage
from openfl.interface.cli_helper import WORKSPACE
+from openfl.pipelines import NoCompressionPipeline
from openfl.transport import AggregatorGRPCClient, AggregatorGRPCServer
+from openfl.transport.serialiser.collaborator import CollaboratorSerialiser
from openfl.utilities.utils import getfqdn_env
SETTINGS = "settings"
@@ -531,7 +533,6 @@ def get_collaborator(
data_loader = self.get_data_loader(collaborator_name)
defaults[SETTINGS]["task_runner"] = self.get_task_runner(data_loader)
- defaults[SETTINGS]["compression_pipeline"] = self.get_tensor_pipe()
defaults[SETTINGS]["task_config"] = self.config.get("tasks", {})
# Check if secure aggregation is enabled.
defaults[SETTINGS]["secure_aggregation"] = (
@@ -549,6 +550,12 @@ def get_collaborator(
certificate,
)
+ defaults[SETTINGS]["serialisation_middleware"] = CollaboratorSerialiser(
+ collaborator_name,
+ defaults[SETTINGS]["client"],
+ self.get_tensor_pipe() or NoCompressionPipeline(),
+ )
+
if self.collaborator_ is None:
self.collaborator_ = Plan.build(**defaults)
diff --git a/openfl/interface/aggregation_functions/weighted_average.py b/openfl/interface/aggregation_functions/weighted_average.py
index 4c0e9ef55f..c48c166e84 100644
--- a/openfl/interface/aggregation_functions/weighted_average.py
+++ b/openfl/interface/aggregation_functions/weighted_average.py
@@ -50,5 +50,6 @@ def call(self, local_tensors, *_) -> np.ndarray:
Returns:
np.ndarray: aggregated tensor
"""
- tensors, weights = zip(*[(x.tensor, x.weight) for x in local_tensors])
+ tensors = np.asarray([x.tensor for x in local_tensors], dtype=np.float32)
+ weights = np.asarray([x.weight for x in local_tensors], dtype=np.float32)
return weighted_average(tensors, weights)
diff --git a/openfl/pipelines/pipeline.py b/openfl/pipelines/pipeline.py
index 66ec44052c..e83771b79e 100644
--- a/openfl/pipelines/pipeline.py
+++ b/openfl/pipelines/pipeline.py
@@ -56,7 +56,7 @@ def __init__(self):
"""Initialize Float32NumpyArrayToBytes."""
self.lossy = False
- def forward(self, data, **kwargs):
+ def forward(self, data: np.ndarray, **kwargs):
"""Convert a float32 Numpy array to bytes.
Args:
@@ -127,8 +127,6 @@ def forward(self, data, **kwargs):
data: The transformed data.
transformer_metadata: The metadata for the transformation.
"""
- transformer_metadata = []
-
# dataformat::numpy::float.32
# model proto:: a collection of tensor_dict proto
# protobuff::-> a layer of weights
@@ -141,7 +139,7 @@ def forward(self, data, **kwargs):
# input:: (data(bytes), transformer_metadata_list::a list of dictionary
# from int to float)
- data = data.copy()
+ transformer_metadata = []
for transformer in self.transformers:
data, metadata = transformer.forward(data=data, **kwargs)
transformer_metadata.append(metadata)
diff --git a/openfl/pipelines/tensor_codec.py b/openfl/pipelines/tensor_codec.py
index 15edde0965..a35397aad9 100644
--- a/openfl/pipelines/tensor_codec.py
+++ b/openfl/pipelines/tensor_codec.py
@@ -4,9 +4,8 @@
"""TensorCodec module."""
-import numpy as np
-
from openfl.pipelines import NoCompressionPipeline
+from openfl.protocols import utils
from openfl.utilities import TensorKey, change_tags
@@ -29,7 +28,7 @@ def __init__(self, compression_pipeline):
Args:
compression_pipeline: The pipeline used for compression.
"""
- self.compression_pipeline = compression_pipeline
+ self.compression_pipeline = compression_pipeline or NoCompressionPipeline()
if self.compression_pipeline.is_lossy():
self.lossless_pipeline = NoCompressionPipeline()
else:
@@ -146,98 +145,80 @@ def decompress(
return decompressed_tensor_key, decompressed_nparray
- @staticmethod
- def generate_delta(tensor_key, nparray, base_model_nparray):
- """Create delta from the updated layer and base layer.
+ def deserialise(self, named_tensor, collaborator_name):
+ """Convert named tensor to a numpy array.
Args:
- tensor_key: This is the tensor_key associated with the nparray.
- Should have a tag of 'trained' or 'aggregated'
- nparray: The nparray that corresponds to the tensorkey.
- base_model_nparray: The base model tensor that will be subtracted
- from the new weights.
+ named_tensor (protobuf): The tensor to convert to nparray.
+ collaborator_name (str): Name of the collaborator for which the named
+ tensor is to be deserialised.
Returns:
- delta_tensor_key: Tensorkey that corresponds to the delta weight
- array.
- delta: Difference between the provided tensors.
+ tuple: The decompressed tensor key and the converted nparray.
"""
- tensor_name, origin, round_number, report, tags = tensor_key
- if not np.isscalar(nparray):
- assert nparray.shape == base_model_nparray.shape, (
- f"Shape of updated layer ({nparray.shape}) is not equal to base "
- f"layer shape of ({base_model_nparray.shape})"
- )
- assert "model" not in tags, (
- "The tensorkey should be provided from the layer with new weights, not the base model"
+ raw_bytes = named_tensor.data_bytes
+ metadata = [
+ {
+ "int_to_float": proto.int_to_float,
+ "int_list": proto.int_list,
+ "bool_list": proto.bool_list,
+ }
+ for proto in named_tensor.transformer_metadata
+ ]
+ # The tensor has already been transferred to collaborator, so
+ # the newly constructed tensor should have the collaborator origin
+ tensor_key = TensorKey(
+ named_tensor.name,
+ collaborator_name,
+ named_tensor.round_number,
+ named_tensor.report,
+ tuple(named_tensor.tags),
)
- new_tags = change_tags(tags, add_field="delta")
- delta_tensor_key = TensorKey(tensor_name, origin, round_number, report, new_tags)
- return delta_tensor_key, nparray - base_model_nparray
-
- @staticmethod
- def apply_delta(tensor_key, delta, base_model_nparray, creates_model=False):
- """Add delta to the nparray.
-
- Args:
- tensor_key: This is the tensor_key associated with the delta.
- Should have a tag of 'trained' or 'aggregated'.
- delta: Weight delta between the new model and old model.
- base_model_nparray: The nparray that corresponds to the prior
- weights.
- creates_model: If flag is set, the tensorkey returned will
- correspond to the aggregator model.
-
- Returns:
- new_model_tensor_key: Latest model layer tensorkey.
- new_model_nparray: Latest layer weights.
- """
- tensor_name, origin, round_number, report, tags = tensor_key
- if not np.isscalar(base_model_nparray):
- assert delta.shape == base_model_nparray.shape, (
- f"Shape of delta ({delta.shape}) is not equal to shape of model"
- f" layer ({base_model_nparray.shape})"
- )
- # assert('model' in tensor_key[3]), 'The tensorkey should be provided
- # from the base model'
- # Aggregator UUID has the prefix 'aggregator'
- if "aggregator" in origin and not creates_model:
- new_tags = change_tags(tags, remove_field="delta")
- new_model_tensor_key = TensorKey(tensor_name, origin, round_number, report, new_tags)
+ *_, tags = tensor_key
+ decompressed_tensor_key = None
+ decompressed_nparray = None
+
+ if "compressed" in tags:
+ lossless = True
+ elif "lossy_compressed" in tags:
+ lossless = False
else:
- new_model_tensor_key = TensorKey(tensor_name, origin, round_number, report, ("model",))
+ # There could be a case where the compression pipeline is bypassed
+ # entirely
+ decompressed_tensor_key = tensor_key
+ decompressed_nparray = raw_bytes
+
+ # Both are set together above, so checking the key alone is sufficient
+ # and avoids misreading an empty byte string as a missing tensor.
+ if decompressed_tensor_key is None:
+ decompressed_tensor_key, decompressed_nparray = self.decompress(
+ tensor_key,
+ data=raw_bytes,
+ transformer_metadata=metadata,
+ require_lossless=lossless,
+ )
+
+ return decompressed_tensor_key, decompressed_nparray
- return new_model_tensor_key, base_model_nparray + delta
+ def serialise(self, tensor_key, nparray, lossless=True):
+ """Construct the NamedTensor Protobuf.
- def find_dependencies(self, tensor_key, send_model_deltas):
- """Resolve the tensors required to do the specified operation.
+ Compresses the tensor with the configured pipeline before wrapping it in
+ a NamedTensor protobuf.
Args:
- tensor_key: A tuple containing the tensor name, origin, round
- number, report, and tags.
- send_model_deltas: A boolean flag indicating whether to send model
- deltas.
+ tensor_key (namedtuple): TensorKey identifying the tensor to serialise.
+ nparray: The uncompressed tensor data associated with the tensor key.
+ lossless (bool): Whether lossless compression is required. Defaults to
+ True.
Returns:
- tensor_key_dependencies: A list of tensor keys that are
- dependencies of the given tensor key.
+ named_tensor (protobuf) : The tensor constructed from the nparray.
"""
- tensor_key_dependencies = []
+ compressed_tensor_key, compressed_nparray, metadata = self.compress(
+ tensor_key, nparray, require_lossless=lossless
+ )
- tensor_name, origin, round_number, report, tags = tensor_key
+ named_tensor = utils.construct_named_tensor(
+ compressed_tensor_key, compressed_nparray, metadata, lossless=lossless
+ )
- if "model" in tags and send_model_deltas:
- if round_number >= 1:
- # The new model can be generated by previous model + delta
- tensor_key_dependencies.append(
- TensorKey(tensor_name, origin, round_number - 1, report, tags)
- )
- if self.compression_pipeline.is_lossy():
- new_tags = ("aggregated", "delta", "lossy_compressed")
- else:
- new_tags = ("aggregated", "delta", "compressed")
- tensor_key_dependencies.append(
- TensorKey(tensor_name, origin, round_number, report, new_tags)
- )
-
- return tensor_key_dependencies
+ return named_tensor
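+
+ # Illustrative round trip (a sketch; `codec` is a TensorCodec instance):
+ # named = codec.serialise(tensor_key, nparray, lossless=True)
+ # key, arr = codec.deserialise(named, "col1")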
diff --git a/openfl/protocols/utils.py b/openfl/protocols/utils.py
index a40f3598d1..d19cd594f0 100644
--- a/openfl/protocols/utils.py
+++ b/openfl/protocols/utils.py
@@ -308,14 +308,12 @@ def datastream_to_proto(proto, stream):
Returns:
proto: The protobuf filled with the data stream.
"""
- npbytes = b""
+ npbytes = bytearray()
for chunk in stream:
- npbytes += chunk.npbytes
+ npbytes.extend(chunk.npbytes)
if len(npbytes) > 0:
- proto.ParseFromString(npbytes)
- if logger is not None:
- logger.debug("datastream_to_proto parsed a %s.", type(proto))
+ proto.ParseFromString(bytes(npbytes))
return proto
else:
raise RuntimeError(f"Received empty stream message of type {type(proto)}")
@@ -336,11 +334,6 @@ def proto_to_datastream(proto, max_buffer_size=(2 * 1024 * 1024)):
npbytes = proto.SerializeToString()
data_size = len(npbytes)
buffer_size = data_size if max_buffer_size > data_size else max_buffer_size
- logger.debug(
- "Setting stream chunks with size %s for proto of type %s",
- buffer_size,
- type(proto),
- )
for i in range(0, data_size, buffer_size):
chunk = npbytes[i : i + buffer_size]
diff --git a/openfl/transport/grpc/aggregator_server.py b/openfl/transport/grpc/aggregator_server.py
index a23015b1a8..220e34dc8f 100644
--- a/openfl/transport/grpc/aggregator_server.py
+++ b/openfl/transport/grpc/aggregator_server.py
@@ -11,7 +11,7 @@
import grpc
from openfl.protocols import aggregator_pb2, aggregator_pb2_grpc, utils
-from openfl.transport.grpc.common import create_grpc_server, create_header
+from openfl.transport.grpc.common import create_grpc_server, create_header, synchronized
logger = logging.getLogger(__name__)
@@ -217,35 +217,29 @@ def GetAggregatedTensor(self, request, context): # NOQA:N802
self.validate_collaborator(request, context)
self.check_request(request)
- collaborator_name = request.header.sender
- tensor_name = request.tensor_name
- require_lossless = request.require_lossless
- round_number = request.round_number
- report = request.report
- tags = tuple(request.tags)
named_tensor = self.aggregator.get_aggregated_tensor(
- collaborator_name,
- tensor_name,
- round_number,
- report,
- tags,
- require_lossless,
+ request.tensor_name,
+ request.round_number,
+ request.report,
+ tuple(request.tags),
+ request.require_lossless,
)
header = create_header(
sender=self.aggregator.uuid,
- receiver=collaborator_name,
+ receiver=request.header.sender,
federation_uuid=self.aggregator.federation_uuid,
single_col_cert_common_name=self.aggregator.single_col_cert_common_name,
)
return aggregator_pb2.GetAggregatedTensorResponse(
header=header,
- round_number=round_number,
+ round_number=request.round_number,
tensor=named_tensor,
)
+ @synchronized
def SendLocalTaskResults(self, request, context): # NOQA:N802
"""Request a model download from aggregator.
diff --git a/openfl/transport/grpc/common.py b/openfl/transport/grpc/common.py
index b91615ca04..4d719b36f6 100644
--- a/openfl/transport/grpc/common.py
+++ b/openfl/transport/grpc/common.py
@@ -3,6 +3,7 @@
"""Common functions for gRPC transport."""
import logging
+import threading
from concurrent.futures import ThreadPoolExecutor
import grpc
@@ -22,6 +23,17 @@
]
+def synchronized(func):
+ """Executes `func` synchronously in a threading lock."""
+ _lock = threading.Lock()
+
+ def wrapper(self, *args, **kwargs):
+ with _lock:
+ return func(self, *args, **kwargs)
+
+ return wrapper
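+
+
+# Illustrative usage (a sketch): serialising a gRPC servicer method so that
+# only one thread at a time executes its body:
+#
+# class Servicer:
+# @synchronized
+# def SendLocalTaskResults(self, request, context):
+# ...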
+
+
def create_insecure_channel(uri) -> grpc.Channel:
"""Creates an insecure gRPC channel."""
return grpc.insecure_channel(uri, options=DEFAULT_CHANNEL_OPTIONS)
diff --git a/openfl/transport/serialiser/__init__.py b/openfl/transport/serialiser/__init__.py
new file mode 100644
index 0000000000..c416f7c523
--- /dev/null
+++ b/openfl/transport/serialiser/__init__.py
@@ -0,0 +1,4 @@
+# Copyright 2020-2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+from openfl.transport.serialiser.collaborator import CollaboratorSerialiser
diff --git a/openfl/transport/serialiser/collaborator.py b/openfl/transport/serialiser/collaborator.py
new file mode 100644
index 0000000000..1080657435
--- /dev/null
+++ b/openfl/transport/serialiser/collaborator.py
@@ -0,0 +1,95 @@
+# Copyright 2020-2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+This module contains the CollaboratorSerialiser, used as middleware between the
+collaborator component and the aggregator client.
+"""
+
+from openfl.pipelines import TensorCodec
+
+
+class CollaboratorSerialiser:
+ """
+ A class used to serialise and deserialise tensors for a collaborator in a federated learning
+ setup.
+ """
+
+ def __init__(
+ self,
+ collaborator_name,
+ client,
+ compression_pipeline,
+ ):
+ self._collaborator_name = collaborator_name
+ self._aggregator_client = client
+ self._tensor_codec = TensorCodec(compression_pipeline)
+
+ def get_tasks(self):
+ """
+ Retrieves the tasks assigned to the collaborator from the aggregator client.
+ """
+ return self._aggregator_client.get_tasks()
+
+ def get_aggregated_tensor(
+ self,
+ tensor_name: str,
+ round_number: int,
+ report: bool,
+ tags: tuple,
+ require_lossless: bool,
+ ):
+ """
+ Retrieves and deserializes an aggregated tensor from the aggregator.
+
+ Args:
+ tensor_name (str): The name of the tensor to retrieve.
+ round_number (int): The round number associated with the tensor.
+ report (bool): Whether to report the retrieval process.
+ tags (tuple): Tags associated with the tensor.
+ require_lossless (bool): Whether lossless retrieval is required.
+
+ Returns:
+ tuple: A tuple containing the tensor key and the deserialized numpy array.
+ """
+ tensor = self._aggregator_client.get_aggregated_tensor(
+ tensor_name,
+ round_number,
+ report,
+ tags,
+ require_lossless,
+ )
+ tensor_key, nparray = self._tensor_codec.deserialise(tensor, self._collaborator_name)
+
+ return tensor_key, nparray
+
+ def send_local_task_results(
+ self,
+ round_number: int,
+ task_name: str,
+ data_size: int = None,
+ tensor_dict: dict = None,
+ ):
+ """
+ Sends the local task results to the aggregator client.
+
+ Args:
+ round_number (int): The current round number of the task.
+ task_name (str): The name of the task.
+ data_size (int, optional): The size of the data. Defaults to None.
+ tensor_dict (dict, optional): A dictionary mapping TensorKeys to
+ (nparray, require_lossless) tuples. Defaults to None.
+
+ Returns:
+ None
+ """
+ tensor_dict = tensor_dict or {}
+ named_tensors = [
+ self._tensor_codec.serialise(tensor_key, value[0], lossless=value[1])
+ for tensor_key, value in tensor_dict.items()
+ ]
+ self._aggregator_client.send_local_task_results(
+ round_number,
+ task_name,
+ data_size,
+ named_tensors,
+ )
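+
+
+# Illustrative wiring (a sketch; `grpc_client` stands in for an
+# AggregatorGRPCClient instance):
+#
+# serialiser = CollaboratorSerialiser("col1", grpc_client, NoCompressionPipeline())
+# tasks, round_num, sleep_time, time_to_quit = serialiser.get_tasks()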
diff --git a/openfl/utilities/utils.py b/openfl/utilities/utils.py
index bab2ccc8c3..ba2a57218c 100644
--- a/openfl/utilities/utils.py
+++ b/openfl/utilities/utils.py
@@ -263,3 +263,54 @@ def remove_readonly(func, path, _):
func(path)
return shutil.rmtree(path, ignore_errors=ignore_errors, onerror=remove_readonly)
+
+
+def generate_delta(tensor_key, nparray, base_model_nparray):
+ """Create delta from the updated layer and base layer.
+
+ Args:
+ tensor_key: This is the tensor_key associated with the nparray.
+ Should have a tag of 'trained' or 'aggregated'
+ nparray: The nparray that corresponds to the tensorkey.
+ base_model_nparray: The base model tensor that will be subtracted
+ from the new weights.
+
+ Returns:
+ delta_tensor_key: Tensorkey that corresponds to the delta weight
+ array.
+ delta: Difference between the provided tensors.
+ """
+ tensor_key = tensor_key._replace(
+ tags=(
+ *tensor_key.tags,
+ "delta",
+ )
+ )
+
+ return tensor_key, nparray - base_model_nparray
+
+
+def apply_delta(tensor_key, delta, base_model_nparray, creates_model=False):
+ """Add delta to the nparray.
+
+ Args:
+ tensor_key: This is the tensor_key associated with the delta.
+ Should have a tag of 'trained' or 'aggregated'.
+ delta: Weight delta between the new model and old model.
+ base_model_nparray: The nparray that corresponds to the prior
+ weights.
+ creates_model: If flag is set, the tensorkey returned will
+ correspond to the aggregator model.
+
+ Returns:
+ new_model_tensor_key: Latest model layer tensorkey.
+ new_model_nparray: Latest layer weights.
+ """
+ if "aggregator" in tensor_key.tags and not creates_model:
+ tensor_key = tensor_key._replace(
+ tags=tuple(tag for tag in tensor_key.tags if tag != "delta")
+ )
+ else:
+ tensor_key = tensor_key._replace(tags=("model",))
+
+ return tensor_key, base_model_nparray + delta
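+
+
+# Illustrative round trip (a sketch; TensorKey fields as used above):
+#
+# tk = TensorKey("conv1", "col1", 0, False, ("trained",))
+# delta_tk, delta = generate_delta(tk, new_weights, base_weights)
+# # delta_tk.tags == ("trained", "delta"); delta == new_weights - base_weights
+# model_tk, restored = apply_delta(delta_tk, delta, base_weights)
+# # model_tk.tags == ("model",); restored == new_weights, since origin "col1"
+# # is not an aggregator UUID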
diff --git a/setup.py b/setup.py
index f1de927a72..8b38003e3e 100644
--- a/setup.py
+++ b/setup.py
@@ -60,7 +60,7 @@ def run(self):
setup(
name='openfl',
- version='1.8.0.dev0',
+ version='1.9.0.dev',
author='OpenFL Team',
description='Federated Learning for the Edge',
long_description=open("README.md", encoding="utf-8").read(),
diff --git a/test-requirements.txt b/test-requirements.txt
index f6bc5071ed..112a8087ac 100644
--- a/test-requirements.txt
+++ b/test-requirements.txt
@@ -1,10 +1,10 @@
docker
lxml==5.3.1
paramiko
-pytest==8.3.4
-pytest-asyncio==0.25.3
+pytest==8.3.5
+pytest-asyncio==0.26.0
pytest-mock==3.14.0
defusedxml==0.7.1
matplotlib==3.10.1
-fpdf==1.7.2
+fpdf2==2.8.2
papermill==2.6.0
diff --git a/tests/end_to_end/models/aggregator.py b/tests/end_to_end/models/aggregator.py
index ca6bb32ab6..9c0cabc32d 100644
--- a/tests/end_to_end/models/aggregator.py
+++ b/tests/end_to_end/models/aggregator.py
@@ -1,13 +1,13 @@
-# Copyright 2020-2023 Intel Corporation
+# Copyright 2020-2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import logging
import os
import tempfile
-import tests.end_to_end.utils.constants as constants
import tests.end_to_end.utils.exceptions as ex
import tests.end_to_end.utils.federation_helper as fh
+import tests.end_to_end.utils.ssh_helper as ssh
log = logging.getLogger(__name__)
@@ -37,6 +37,7 @@ def __init__(self, agg_domain_name, workspace_path, eval_scope=False, container_
self.container_id = container_id
self.tensor_db_file = os.path.join(self.workspace_path, "local_state", "tensor.db")
self.res_file = None # Result file to track the logs
+ self.start_process = None # Process associated with the aggregator start command
def generate_sign_request(self):
"""
@@ -70,16 +71,24 @@ def start(self):
log_file = os.path.join("logs", "aggregator.log")
self.res_file = os.path.join(self.workspace_path, log_file)
- command = f"LOG_FILE={log_file} {constants.AGG_START_CMD}"
+ command = ["fx", "aggregator", "start"]
if self.eval_scope:
- command += " --task_group evaluation"
- fh.run_command(
- command,
- error_msg=error_msg,
- container_id=self.container_id,
- workspace_path=self.workspace_path,
- run_in_background=True,
- bg_file=os.path.join(tempfile.mkdtemp(), "tmp.log"), # this file is simply to keep the process running
+ command.append("--task_group")
+ command.append("evaluation")
+ log.info(f"Command for {self.name}: {command}")
+
+ # Set the log file path for the aggregator process
+ env = os.environ.copy()
+ env["LOG_FILE"] = log_file
+
+ # open file in append mode, so that restarting scenarios can be handled
+ bg_file = open(os.path.join(tempfile.mkdtemp(), "tmp.log"), "a", buffering=1)
+ self.start_process = ssh.run_command_background(
+ cmd=command,
+ work_dir=self.workspace_path,
+ redirect_to_file=bg_file,
+ check_sleep=60,
+ env=env
)
log.info(
@@ -90,6 +99,21 @@ def start(self):
raise e
return True
+ def kill_process(self):
+ """
+ Kill the process of the aggregator and wait for it to finish
+ """
+ try:
+ if self.start_process:
+ self.start_process.kill()
+ self.start_process.wait()
+ self.start_process = None
+ else:
+ log.warning("No process found for aggregator")
+ except Exception as e:
+ log.error(f"Failed to kill the process: {e}")
+ raise ex.ProcessKillException(f"Failed to kill the process: {e}")
+
def modify_data_file(self, data_file, col_name, index):
"""
Modify the data.yaml file for the model
diff --git a/tests/end_to_end/models/collaborator.py b/tests/end_to_end/models/collaborator.py
index ad06f96657..a0c8fe420a 100644
--- a/tests/end_to_end/models/collaborator.py
+++ b/tests/end_to_end/models/collaborator.py
@@ -1,13 +1,13 @@
-# Copyright 2020-2023 Intel Corporation
+# Copyright 2020-2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import os
import logging
import tempfile
-import tests.end_to_end.utils.constants as constants
import tests.end_to_end.utils.exceptions as ex
import tests.end_to_end.utils.federation_helper as fh
+import tests.end_to_end.utils.ssh_helper as ssh
log = logging.getLogger(__name__)
@@ -38,6 +38,7 @@ def __init__(self, collaborator_name=None, data_directory_path=None, workspace_p
self.workspace_path = workspace_path
self.container_id = container_id
self.res_file = None # Result file to track the logs
+ self.start_process = None # Process associated with the collaborator start command
def generate_sign_request(self):
"""
@@ -130,13 +131,21 @@ def start(self):
log_file = os.path.join("logs", f"{self.collaborator_name}.log")
self.res_file = os.path.join(self.workspace_path, log_file)
- fh.run_command(
- command=f"LOG_FILE={log_file} {constants.COL_START_CMD.format(self.collaborator_name)}",
- error_msg=error_msg,
- container_id=self.container_id,
- workspace_path=self.workspace_path,
- run_in_background=True,
- bg_file=os.path.join(tempfile.mkdtemp(), "tmp.log"), # this file is simply to keep the process running
+ command = ["fx", "collaborator", "start", "-n", self.collaborator_name]
+ log.info(f"Command for {self.name}: {command}")
+
+ # Set the log file path for the collaborator process
+ env = os.environ.copy()
+ env["LOG_FILE"] = log_file
+
+ # open file in append mode, so that restarting scenarios can be handled
+ bg_file = open(os.path.join(tempfile.mkdtemp(), "tmp.log"), "a", buffering=1)
+ self.start_process = ssh.run_command_background(
+ cmd=command,
+ work_dir=self.workspace_path,
+ redirect_to_file=bg_file,
+ check_sleep=60,
+ env=env
)
log.info(
@@ -147,25 +156,21 @@ def start(self):
raise e
return True
- def install_dependencies(self):
+ def kill_process(self):
"""
- Install the dependencies for the collaborator
- Returns:
- bool: True if successful, else False
+ Kill the process of the collaborator and wait for it to finish
"""
try:
- cmd = f"pip install -r requirements.txt"
- error_msg = f"Failed to install dependencies for {self.collaborator_name}"
- return_code, output, error = fh.run_command(
- cmd,
- error_msg=error_msg,
- container_id=self.container_id,
- workspace_path=self.workspace_path,
- )
+ if self.start_process:
+ self.start_process.kill()
+ self.start_process.wait()
+ self.start_process = None
+ else:
+ log.warning(f"No process found for {self.collaborator_name}")
+
except Exception as e:
- log.error(f"{error_msg}: {e}")
- raise e
- return True
+ log.error(f"Failed to kill the process: {e}")
+ raise ex.ProcessKillException(f"Failed to kill the process: {e}")
def import_workspace(self):
"""
diff --git a/tests/end_to_end/test_suites/tr_flower_tests.py b/tests/end_to_end/test_suites/tr_flower_tests.py
new file mode 100644
index 0000000000..42fafc47a5
--- /dev/null
+++ b/tests/end_to_end/test_suites/tr_flower_tests.py
@@ -0,0 +1,58 @@
+# Copyright 2020-2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+import pytest
+import logging
+
+from tests.end_to_end.utils.tr_common_fixtures import (
+ fx_federation_tr,
+ fx_federation_tr_dws,
+)
+from tests.end_to_end.utils import federation_helper as fed_helper
+from tests.end_to_end.utils.exceptions import FlowerAppException
+
+log = logging.getLogger(__name__)
+
+
+@pytest.mark.task_runner_basic
+def test_flower_app_pytorch_native(request, fx_federation_tr):
+ """
+ Test federation via native task runner for Flower app with PyTorch.
+ Args:
+ request (Fixture): Pytest fixture
+ fx_federation_tr (Fixture): Pytest fixture for native task runner
+ """
+ if request.config.num_rounds != 1:
+ raise FlowerAppException("Flower app with PyTorch only supports 1 round of training.")
+
+ # Start the federation
+ assert fed_helper.run_federation(fx_federation_tr)
+
+ # Verify the completion of the federation run
+ assert fed_helper.verify_federation_run_completion(
+ fx_federation_tr,
+ test_env=request.config.test_env,
+ num_rounds=request.config.num_rounds,
+ ), "Federation completion failed"
+
+
+@pytest.mark.task_runner_dockerized_ws
+def test_flower_app_pytorch_dockerized_workspace(request, fx_federation_tr_dws):
+ """
+ Test federation via dockerized workspace for Flower app with PyTorch.
+ Args:
+ request (Fixture): Pytest fixture
+ fx_federation_tr_dws (Fixture): Pytest fixture for dockerized workspace
+ """
+ if request.config.num_rounds != 1:
+ raise FlowerAppException("Flower app with PyTorch only supports 1 round of training.")
+
+ # Start the federation
+ assert fed_helper.run_federation_for_dws(fx_federation_tr_dws, request.config.use_tls)
+
+ # Verify the completion of the federation run
+ assert fed_helper.verify_federation_run_completion(
+ fx_federation_tr_dws,
+ test_env=request.config.test_env,
+ num_rounds=request.config.num_rounds,
+ ), "Federation completion failed"
diff --git a/tests/end_to_end/test_suites/tr_resiliency_tests.py b/tests/end_to_end/test_suites/tr_resiliency_tests.py
index 55bf412ce2..940d1607b2 100644
--- a/tests/end_to_end/test_suites/tr_resiliency_tests.py
+++ b/tests/end_to_end/test_suites/tr_resiliency_tests.py
@@ -279,6 +279,7 @@ def _perform_restart_validate_rounds(fed_obj, db_file, total_rounds):
"""
init_round = fed_helper.get_current_round(db_file)
+ log.info(f"Round number is {init_round} before restarts")
# Restart aggregator
assert int_helper.restart_participants([fed_obj.aggregator])
diff --git a/tests/end_to_end/utils/constants.py b/tests/end_to_end/utils/constants.py
index a8c21f7a8e..fd13cc63f6 100644
--- a/tests/end_to_end/utils/constants.py
+++ b/tests/end_to_end/utils/constants.py
@@ -19,6 +19,7 @@ class ModelName(Enum):
TORCH_MNIST_STRAGGLER_CHECK = "torch/mnist_straggler_check"
XGB_HIGGS = "xgb_higgs"
GANDLF_SEG_TEST = "gandlf_seg_test"
+ FLOWER_APP_PYTORCH = "flower-app-pytorch"
NUM_COLLABORATORS = 2
NUM_ROUNDS = 5
@@ -43,7 +44,7 @@ class ModelName(Enum):
AGG_COL_RESULT_FILE = "{0}/{1}/workspace/{1}.log" # example - /tmp/my_federation/aggregator/workspace/aggregator.log
-AGG_WORKSPACE_ZIP_NAME = "workspace.zip"
+DFLT_WORKSPACE_NAME = "workspace"
# Memory logs related
AGG_MEM_USAGE_JSON = "{}/aggregator/workspace/logs/aggregator_memory_usage.json" # example - /tmp/my_federation/aggregator/workspace/logs/aggregator_memory_usage.json
@@ -55,5 +56,4 @@ class ModelName(Enum):
COL_END_MSG = "Received shutdown signal"
COL_CERTIFY_CMD = "fx collaborator certify --import 'agg_to_col_{}_signed_cert.zip'"
-DFLT_DOCKERIZE_IMAGE_NAME = "workspace"
EXCEPTION = "Exception"
diff --git a/tests/end_to_end/utils/docker_helper.py b/tests/end_to_end/utils/docker_helper.py
index 7a3affba55..e3cab5824f 100644
--- a/tests/end_to_end/utils/docker_helper.py
+++ b/tests/end_to_end/utils/docker_helper.py
@@ -94,7 +94,7 @@ def start_docker_container_with_federation_run(
else:
local_participant_path = participant.workspace_path
- docker_participant_path = f"/{constants.DFLT_DOCKERIZE_IMAGE_NAME}"
+ docker_participant_path = f"/{constants.DFLT_WORKSPACE_NAME}"
volumes = {
local_participant_path: {"bind": docker_participant_path, "mode": "rw"},
diff --git a/tests/end_to_end/utils/exceptions.py b/tests/end_to_end/utils/exceptions.py
index 999b52658c..e7c353eaa3 100644
--- a/tests/end_to_end/utils/exceptions.py
+++ b/tests/end_to_end/utils/exceptions.py
@@ -1,4 +1,4 @@
-# Copyright 2020-2023 Intel Corporation
+# Copyright 2020-2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
"""Module consists of custom exceptions for end to end testing"""
@@ -119,3 +119,13 @@ class TensorDBException(Exception):
class GaNDLFConfigSegException(Exception):
"""Exception for GaNDLF config segmentation file"""
pass
+
+
+class FlowerAppException(Exception):
+ """Exception for Flower app"""
+ pass
+
+
+class ProcessKillException(Exception):
+ """Exception for process kill"""
+ pass
diff --git a/tests/end_to_end/utils/federation_helper.py b/tests/end_to_end/utils/federation_helper.py
index 4f719355f7..f0a64c35bd 100644
--- a/tests/end_to_end/utils/federation_helper.py
+++ b/tests/end_to_end/utils/federation_helper.py
@@ -1,4 +1,4 @@
-# Copyright 2020-2023 Intel Corporation
+# Copyright 2020-2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import time
@@ -17,6 +17,7 @@
import tests.end_to_end.utils.db_helper as db_helper
import tests.end_to_end.utils.docker_helper as dh
import tests.end_to_end.utils.exceptions as ex
+import tests.end_to_end.utils.interruption_helper as intr_helper
import tests.end_to_end.utils.ssh_helper as ssh
from tests.end_to_end.models import collaborator as col_model
@@ -144,7 +145,7 @@ def _create_tarball(collaborator_name, data_file_path, local_bind_path, add_data
]
client_certs = " ".join(client_cert_entries) if client_cert_entries else ""
tarfiles += f" agg_to_col_{collaborator_name}_signed_cert.zip {client_certs}"
- # IMPORTANT: Model XGBoost(xgb_higgs) uses format like data/1 and data/2, thus adding data to tarball in the same format.
+ # IMPORTANT: The XGBoost (xgb_higgs) and Flower models use a directory layout like data/1 and data/2, so data is added to the tarball in the same format.
if add_data:
tarfiles += f" data/{data_file_path}"
@@ -218,18 +219,15 @@ def copy_file_between_participants(
return True
-def run_federation(fed_obj, install_dependencies=True):
+def run_federation(fed_obj):
"""
Start the federation
Args:
fed_obj (object): Federation fixture object
- install_dependencies (bool): Install dependencies on collaborators (default is True)
Returns:
bool: True if successful, else False
"""
executor = concurrent.futures.ThreadPoolExecutor()
- if install_dependencies:
- install_dependencies_on_collaborators(fed_obj)
# Set the backend (KERAS_BACKEND) for Keras as an environment variable
if "keras" in fed_obj.model_name:
@@ -263,7 +261,7 @@ def run_federation_for_dws(fed_obj, use_tls):
try:
container = dh.start_docker_container_with_federation_run(
participant=participant,
- image=constants.DFLT_DOCKERIZE_IMAGE_NAME,
+ image=constants.DFLT_WORKSPACE_NAME,
use_tls=use_tls,
env_keyval_list=set_keras_backend(fed_obj.model_name) if "keras" in fed_obj.model_name else None,
)
@@ -277,27 +275,6 @@ def run_federation_for_dws(fed_obj, use_tls):
return True
-def install_dependencies_on_collaborators(fed_obj):
- """
- Install dependencies on all the collaborators
- """
- executor = concurrent.futures.ThreadPoolExecutor()
- # Install dependencies on collaborators
- # This is a time taking process, thus doing at this stage after all verification is done
- log.info("Installing dependencies on collaborators. This might take some time...")
- futures = [
- executor.submit(participant.install_dependencies)
- for participant in fed_obj.collaborators
- ]
- results = [f.result() for f in futures]
- log.info(
- f"Results from all the collaborators for installation of dependencies: {results}"
- )
-
- if not all(results):
- raise Exception("Failed to install dependencies on one or more collaborators")
-
-
def verify_federation_run_completion(fed_obj, test_env, num_rounds):
"""
Verify the completion of the process for all the participants
@@ -380,20 +357,18 @@ def _verify_completion_for_participant(
time.sleep(45)
- # Verify that the process is completed successfully
- get_process_id = constants.AGG_START_CMD if participant.name == "aggregator" else constants.COL_START_CMD.format(participant.name)
-
- # Find the process ID
- pids = []
- for line in os.popen(f"ps ax | grep '{get_process_id}' | grep -v grep"):
- fields = line.split()
- pids.append(fields[0])
-
- if not pids:
- log.info(f"No processes found for participant {participant.name}")
- break
+ # poll() returns the exit code once the process has finished and None
+ # while it is still running. This applies to native processes only.
+ if participant.start_process:
+ if participant.start_process.poll() is not None:
+ log.info(f"Process has completed for participant {participant.name}")
+ break
+ else:
+ log.info(f"Process is yet to complete for {participant.name}")
else:
- log.info(f"Process is yet to complete for {participant.name}")
+ # Dockerized workspace scenario
+ log.info(f"No process found for participant {participant.name}")
# Read tensor.db file for aggregator to check if the process is completed
if participant.name == "aggregator" and num_rounds > 1:
@@ -419,7 +394,7 @@ def federation_env_setup_and_validate(request, eval_scope=False):
test_env = request.config.test_env
# Validate the model name and create the workspace name
- if not request.config.model_name.replace("/", "_").upper() in constants.ModelName._member_names_:
+ if not request.config.model_name.replace("/", "_").replace("-", "_").upper() in constants.ModelName._member_names_:
raise ValueError(f"Invalid model name: {request.config.model_name}")
# Set the workspace path specific to the model and the test case
@@ -462,30 +437,6 @@ def federation_env_setup_and_validate(request, eval_scope=False):
return workspace_path, local_bind_path, agg_domain_name
-def add_local_workspace_permission(local_bind_path):
- """
- Add permission to workspace. This is aggregator/model owner specific operation.
- Args:
- workspace_path (str): Workspace path
- agg_container_id (str): Container ID
- """
- try:
- agg_workspace_path = constants.AGG_WORKSPACE_PATH.format(local_bind_path)
- return_code, output, error = run_command(
- f"sudo chmod -R 777 {agg_workspace_path}",
- workspace_path=local_bind_path,
- )
- if return_code != 0:
- raise Exception(f"Failed to add local permission to workspace: {error}")
-
- log.debug(
- f"Recursive permission added to workspace on local machine: {agg_workspace_path}"
- )
- except Exception as e:
- log.error(f"Failed to add local permission to workspace: {e}")
- raise e
-
-
def create_persistent_store(participant_name, local_bind_path):
"""
Create persistent store for the participant on local machine (even for docker)
@@ -498,8 +449,7 @@ def create_persistent_store(participant_name, local_bind_path):
error_msg = f"Failed to create persistent store for {participant_name}"
cmd_persistent_store = (
f"export WORKING_DIRECTORY={local_bind_path}; "
- f"mkdir -p $WORKING_DIRECTORY/{participant_name}/workspace; "
- "sudo chmod -R 755 $WORKING_DIRECTORY"
+ f"mkdir -p $WORKING_DIRECTORY/{participant_name}/workspace"
)
log.debug(f"Creating persistent store")
return_code, output, error = run_command(
@@ -639,7 +589,7 @@ def setup_collaborator(index, workspace_path, local_bind_path):
local_bind_path, collaborator.name
)
copy_file_between_participants(
- local_agg_ws_path, local_col_ws_path, constants.AGG_WORKSPACE_ZIP_NAME
+ local_agg_ws_path, local_col_ws_path, f"{constants.DFLT_WORKSPACE_NAME}.zip"
)
collaborator.import_workspace()
except Exception as e:
@@ -674,6 +624,8 @@ def setup_collaborator_data(collaborators, model_name, local_bind_path):
# Below step will also modify the data.yaml file for all the collaborators
if model_name == constants.ModelName.XGB_HIGGS.value:
download_higgs_data(collaborators, local_bind_path)
+ elif model_name == constants.ModelName.FLOWER_APP_PYTORCH.value:
+ download_flower_data(collaborators, local_bind_path)
log.info("Data setup is complete for all the collaborators")
@@ -741,6 +693,19 @@ def copy_gandlf_data_to_collaborators(aggregator, collaborators, local_bind_path
raise ex.DataSetupException(f"Failed to modify the data file: {e}")
+def download_flower_data(collaborators, local_bind_path):
+ """
+ Download the data for the model and copy it to the respective collaborator workspaces.
+ Also modify the data.yaml file for all the collaborators.
+ Args:
+ collaborators (list): List of collaborator objects
+ local_bind_path (str): Local bind path
+ Returns:
+ bool: True if successful, else False
+ """
+ return common_download_for_higgs_and_flower(collaborators, local_bind_path)
+
+
def download_higgs_data(collaborators, local_bind_path):
"""
Download the data for the model and copy to the respective collaborator workspaces
@@ -751,6 +716,15 @@ def download_higgs_data(collaborators, local_bind_path):
Returns:
bool: True if successful, else False
"""
+ return common_download_for_higgs_and_flower(collaborators, local_bind_path)
+
+
+def common_download_for_higgs_and_flower(collaborators, local_bind_path):
+ """
+ Common function to download the data for both the Higgs and Flower models.
+ If other models adopt a similar data setup in the future, they can reuse this
+ function; if the setup diverges for any model, adapt it here.
+ Args:
+ collaborators (list): List of collaborator objects
+ local_bind_path (str): Local bind path
+ Returns:
+ bool: True if successful
+ """
log.info(f"Copying {constants.DATA_SETUP_FILE} from one of the collaborator workspaces to the local bind path..")
try:
shutil.copyfile(
@@ -765,7 +739,7 @@ def download_higgs_data(collaborators, local_bind_path):
command = ["python", constants.DATA_SETUP_FILE, str(len(collaborators))]
subprocess.run(command, cwd=local_bind_path, check=True) # nosec B603
except Exception:
- raise ex.DataSetupException(f"Failed to download data for XGBoost model")
+ raise ex.DataSetupException("Failed to download data for the given model")
try:
# Copy the data to the respective workspaces based on the index
@@ -786,9 +760,9 @@ def download_higgs_data(collaborators, local_bind_path):
except Exception as e:
raise ex.DataSetupException(f"Failed to modify the data file: {e}")
- # Below step is specific to XGBoost model which uses higgs_data folder to create data folders.
+ # The XGBoost model stages its data in a 'higgs_data' folder and the Flower model in a 'data' folder; clean up both.
shutil.rmtree(os.path.join(local_bind_path, "higgs_data"), ignore_errors=True)
-
+ shutil.rmtree(os.path.join(local_bind_path, "data"), ignore_errors=True)
return True
@@ -1102,52 +1076,27 @@ def set_keras_backend(model_name):
return [f"KERAS_BACKEND={backend}"]
-def remove_stale_processes(num_collaborators=0, envoys=[], director=False):
+def remove_stale_processes(aggregator=None, collaborators=[], director=None, envoys=[]):
"""
Remove stale processes
+ Args:
+ aggregator (object): Aggregator object
+ collaborators (list): List of collaborator objects
+ director (object): Director object
+ envoys (list): List of envoy objects
"""
- if num_collaborators > 0:
- log.info("Removing stale processes..")
- # Remove any stale processes
- try:
- for i in range(1, num_collaborators + 1):
- subprocess.run(
- f"sudo kill -9 $(ps -ef | grep 'collaborator{i}' | awk '{{print $2}}')",
- shell=True,
- check=True,
- stdout=subprocess.DEVNULL,
- stderr=subprocess.DEVNULL
- )
- subprocess.run(
- "sudo kill -9 $(ps -ef | grep 'aggregator' | awk '{{print $2}}')",
- shell=True,
- check=True,
- stdout=subprocess.DEVNULL,
- stderr=subprocess.DEVNULL
- )
- except subprocess.CalledProcessError as e:
- log.warning(f"Failed to kill processes: {e}")
+ if aggregator:
+ intr_helper.kill_processes(aggregator.name)
+
+ for collaborator in collaborators:
+ intr_helper.kill_processes(collaborator.name)
if director:
- try:
- subprocess.run(
- "sudo kill -9 $(ps -ef | grep 'director' | awk '{{print $2}}')",
- shell=True,
- check=True,
- )
- except subprocess.CalledProcessError as e:
- log.warning(f"Failed to kill processes: {e}")
-
- if envoys:
- for envoy in envoys:
- try:
- subprocess.run(
- f"sudo kill -9 $(ps -ef | grep '{envoy}' | awk '{{print $2}}')",
- shell=True,
- check=True,
- )
- except subprocess.CalledProcessError as e:
- log.warning(f"Failed to kill processes: {e}")
+ intr_helper.kill_processes("director")
+
+ for envoy in envoys:
+ intr_helper.kill_processes(envoy)
+
log.info("Stale processes (if any) removed successfully")
diff --git a/tests/end_to_end/utils/interruption_helper.py b/tests/end_to_end/utils/interruption_helper.py
index 4e6eca2c46..2fd8686939 100644
--- a/tests/end_to_end/utils/interruption_helper.py
+++ b/tests/end_to_end/utils/interruption_helper.py
@@ -4,7 +4,7 @@
import logging
import concurrent.futures
import time
-import os
+import psutil
import subprocess # nosec B404
import tests.end_to_end.utils.constants as constants
@@ -75,30 +75,65 @@ def stop_start_native_participant(participant, action):
if action not in ["stop", "start"]:
raise ex.ParticipantStopException(f"Invalid action {action}")
- if action == "stop":
- log.info(f"Stopping participant {participant.name}")
- cmd_for_process_kill = constants.AGG_START_CMD if participant.name == "aggregator" else constants.COL_START_CMD.format(participant.name)
- pids = []
- # Find the process ID
- for line in os.popen(f"ps ax | grep '{cmd_for_process_kill}' | grep -v grep"):
- fields = line.split()
- pids.append(fields[0])
-
- if not pids:
- raise RuntimeError(f"No processes found for command '{cmd_for_process_kill}'")
-
- # Kill all processes using sudo
- for pid in pids:
- try:
- subprocess.run(['sudo', 'kill', '-9', pid], check=True)
- except subprocess.CalledProcessError as e:
- raise RuntimeError(f"Failed to kill process '{pid}': {e}")
+ # Irrespective of the action, kill any running processes first to ensure a clean state
+ log.info(f"Killing any running processes for {participant.name} to avoid conflicts")
+ participant.kill_process()
+ if action == "stop":
+ log.info(f"Stopped {participant.name} successfully")
else:
try:
- log.info(f"Starting participant {participant.name}")
participant.start()
+ log.info(f"Started {participant.name} successfully")
except Exception as e:
- raise ex.ParticipantStartException(f"Error starting participant: {e}")
+ raise ex.ParticipantStartException(f"Error starting participant {participant.name}: {e}")
return True
+
+
+def get_pids_for_active_command(command):
+ """
+ Get the process IDs of the given command if it is running.
+
+ Args:
+ command (str): The command to check.
+
+ Returns:
+ list: List of process IDs if the command is running, otherwise an empty list.
+ """
+ pids = []
+ for proc in psutil.process_iter(['pid', 'cmdline']):
+ try:
+ cmdline = proc.info['cmdline']
+ if isinstance(cmdline, list):
+ cmdline = ' '.join(cmdline)
+ if command in cmdline:
+ pids.append(proc.info['pid'])
+ except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
+ continue
+ return pids
+
+
+def kill_processes(command_to_kill, fail_if_not_found=False):
+ """
+ Kill all processes for the given command.
+
+ Args:
+ command_to_kill (str): The command to kill.
+ fail_if_not_found (bool): Raise if no matching process is found or a kill fails.
+
+ Returns:
+ bool: True if processes were killed, False otherwise.
+ """
+ pids = get_pids_for_active_command(command_to_kill)
+ log.info(f"PIDs for command '{command_to_kill}': {pids}")
+ if not pids and fail_if_not_found:
+ raise RuntimeError(f"No running process found for command '{command_to_kill}'")
+ # Kill each process; failures only propagate when fail_if_not_found is set
+ for pid in pids:
+ try:
+ subprocess.run(['sudo', 'kill', '-9', str(pid)], check=True)
+ log.info(f"Killed process with PID {pid}")
+ except subprocess.CalledProcessError:
+ if fail_if_not_found:
+ raise RuntimeError(f"Failed to kill process with PID {pid}")
+ return False
+ return True
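A minimal usage sketch for the two helpers above (the command string is illustrative; any substring of a process command line matches):

    import tests.end_to_end.utils.interruption_helper as intr_helper

    # Look up PIDs whose command line contains the given substring.
    pids = intr_helper.get_pids_for_active_command("collaborator1")

    # Best-effort kill: returns True even when nothing matched.
    intr_helper.kill_processes("collaborator1")

    # Strict kill: raises RuntimeError if no matching process is found
    # or a kill command fails.
    intr_helper.kill_processes("collaborator1", fail_if_not_found=True)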
diff --git a/tests/end_to_end/utils/ssh_helper.py b/tests/end_to_end/utils/ssh_helper.py
index 2bd7c34ddf..3ccb97730f 100644
--- a/tests/end_to_end/utils/ssh_helper.py
+++ b/tests/end_to_end/utils/ssh_helper.py
@@ -1,4 +1,4 @@
-# Copyright 2020-2023 Intel Corporation
+# Copyright 2020-2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import subprocess
@@ -14,7 +14,7 @@
def run_command_background(
- cmd, return_error=False, print_stdout=False, work_dir=None, redirect_to_file=None, check_sleep=1
+ cmd, return_error=False, print_stdout=False, work_dir=None, redirect_to_file=None, check_sleep=1, env=None
):
"""Execute a command and let it run in background.
@@ -30,9 +30,10 @@ def run_command_background(
redirect_to_file: The file descriptor to which the STDERR and STDOUT will be written.
check_sleep: Time in seconds to sleep before polling to make sure
the background process is still running.
+ env: Environment mapping for the command. When provided, it replaces the inherited environment.
Returns:
- Popen object of the subprocess. None, if the command completed immediately.
+ Popen object of the subprocess.
"""
if isinstance(cmd, list):
shell = False
@@ -46,7 +47,7 @@ def run_command_background(
output_redirect = subprocess.PIPE
error_redirect = subprocess.PIPE
process = subprocess.Popen(
- cmd, stdout=output_redirect, stderr=error_redirect, shell=shell, text=True, cwd=work_dir
+ cmd, stdout=output_redirect, stderr=error_redirect, shell=shell, text=True, cwd=work_dir, env=env
)
time.sleep(check_sleep)
return_code = process.poll()
@@ -72,7 +73,8 @@ def run_command_background(
output = process.stdout.read().rstrip("\n").split("\n")
if print_stdout and output is not None:
log.info(f"Command to run - {cmd} output - {output}")
- return None
+
+ return process
def run_command(
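A hedged example of the new env parameter. subprocess.Popen applies the mapping wholesale, so callers typically extend os.environ rather than passing only the extra variables (the command, backend value, and workspace path below are illustrative):

    import os

    # Extend, rather than replace, the inherited environment.
    env = {**os.environ, "KERAS_BACKEND": "torch"}
    process = run_command_background(
        "fx collaborator start -n collaborator1",  # illustrative command
        work_dir=workspace_path,  # hypothetical workspace directory
        env=env,
    )
    # A Popen object is now always returned, so callers can poll() it later.
    assert process.poll() is None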
diff --git a/tests/end_to_end/utils/tr_workspace.py b/tests/end_to_end/utils/tr_workspace.py
index 5fba9b357b..10715fa276 100644
--- a/tests/end_to_end/utils/tr_workspace.py
+++ b/tests/end_to_end/utils/tr_workspace.py
@@ -44,6 +44,13 @@ def common_workspace_creation(request, eval_scope=False):
agg_workspace_path = constants.AGG_WORKSPACE_PATH.format(workspace_path)
+ # For Flower App PyTorch, the number of rounds must be 1
+ if request.config.model_name.lower() == constants.ModelName.FLOWER_APP_PYTORCH.value:
+ if request.config.num_rounds != 1:
+ raise ex.FlowerAppException(
+ "Flower app with PyTorch only supports 1 round of training."
+ )
+
# Create model owner object and the workspace for the model
# Workspace name will be same as the model name
model_owner = mo_model.ModelOwner(
@@ -54,7 +61,6 @@ def common_workspace_creation(request, eval_scope=False):
fh.create_persistent_store(model_owner.name, local_bind_path)
model_owner.create_workspace()
- fh.add_local_workspace_permission(local_bind_path)
# Modify the plan
plan_path = constants.AGG_PLAN_PATH.format(local_bind_path)
@@ -142,14 +148,14 @@ def create_tr_workspace(request, eval_scope=False):
# Data setup requires total no of collaborators, thus keeping the function call
# outside of the loop
- if request.config.model_name.lower() == constants.ModelName.XGB_HIGGS.value:
+ if request.config.model_name.lower() in [constants.ModelName.XGB_HIGGS.value, constants.ModelName.FLOWER_APP_PYTORCH.value]:
fh.setup_collaborator_data(collaborators, request.config.model_name, local_bind_path)
if request.config.use_tls:
fh.setup_pki_for_collaborators(collaborators, model_owner, local_bind_path)
fh.import_pki_for_collaborators(collaborators)
- fh.remove_stale_processes(request.config.num_collaborators)
+ fh.remove_stale_processes(aggregator, collaborators)
# Return the federation fixture
return federation_details(
@@ -294,7 +300,6 @@ def create_tr_dws_workspace(request, eval_scope=False):
# Command 'fx workspace dockerize --save ..' will use the workspace name for
# image name which is 'workspace' in this case.
model_owner.dockerize_workspace(constants.DEFAULT_OPENFL_IMAGE)
- image_name = constants.DFLT_DOCKERIZE_IMAGE_NAME
# Certify the workspace in case of TLS
if request.config.use_tls:
@@ -331,14 +336,14 @@ def create_tr_dws_workspace(request, eval_scope=False):
# Data setup requires total no of collaborators, thus keeping the function call
# outside of the loop
- if request.config.model_name.lower() == constants.ModelName.XGB_HIGGS.value:
+ if request.config.model_name.lower() in [constants.ModelName.XGB_HIGGS.value, constants.ModelName.FLOWER_APP_PYTORCH.value]:
fh.setup_collaborator_data(collaborators, request.config.model_name, local_bind_path)
# Note: In case of multiple machines setup, scp the created tar for collaborators
# to the other machine(s)
fh.create_tarball_for_collaborators(
collaborators, local_bind_path, use_tls=request.config.use_tls,
- add_data=True if request.config.model_name.lower() == constants.ModelName.XGB_HIGGS.value else False
+ add_data=request.config.model_name.lower() in [constants.ModelName.XGB_HIGGS.value, constants.ModelName.FLOWER_APP_PYTORCH.value]
)
# Generate the sign request and certify the aggregator in case of TLS
@@ -357,7 +362,7 @@ def create_tr_dws_workspace(request, eval_scope=False):
# Note: In case of multiple machines setup, scp this workspace tar
# to the other machine(s) so that docker load can load the image.
- model_owner.load_workspace(workspace_tar_name=f"{image_name}.tar")
+ model_owner.load_workspace(workspace_tar_name=f"{constants.DFLT_WORKSPACE_NAME}.tar")
# Return the federation fixture
return federation_details(
diff --git a/tests/openfl/component/aggregator/test_aggregator.py b/tests/openfl/component/aggregator/test_aggregator.py
index f9883fd7b7..342a218fe3 100644
--- a/tests/openfl/component/aggregator/test_aggregator.py
+++ b/tests/openfl/component/aggregator/test_aggregator.py
@@ -137,7 +137,6 @@ def test_get_tasks(agg, col_name, tasks, time_to_quit,
def test_get_aggregated_tensor(agg):
"""Test that test_get_tasks is failed without a correspond data."""
- collaborator_name = 'col1'
tensor_name = 'test_tensor_name'
require_lossless = False
round_number = 0
@@ -145,7 +144,7 @@ def test_get_aggregated_tensor(agg):
tags = ['compressed']
with pytest.raises(ValueError):
agg.get_aggregated_tensor(
- collaborator_name, tensor_name, round_number, report, tags, require_lossless)
+ tensor_name, round_number, report, tags, require_lossless)
def test_collaborator_task_completed_none(agg):
diff --git a/tests/openfl/component/collaborator/test_collaborator.py b/tests/openfl/component/collaborator/test_collaborator.py
index 49288c2ffe..6157192293 100644
--- a/tests/openfl/component/collaborator/test_collaborator.py
+++ b/tests/openfl/component/collaborator/test_collaborator.py
@@ -8,16 +8,20 @@
import pytest
from openfl.component.collaborator import Collaborator
+from openfl.pipelines import NoCompressionPipeline
from openfl.protocols import base_pb2
+from openfl.transport.serialiser import CollaboratorSerialiser
from openfl.utilities.types import TensorKey
-
@pytest.fixture
def collaborator_mock():
"""Initialize the collaborator mock."""
col = Collaborator('col1', 'some_uuid', 'federation_uuid',
mock.Mock(), mock.Mock(), mock.Mock(), opt_treatment='RESET')
col.tensor_db = mock.Mock()
+ col._serialisation_middleware = CollaboratorSerialiser(
+ col.collaborator_name, mock.Mock(), NoCompressionPipeline()
+ )
return col
@@ -93,33 +97,17 @@ def test_do_task(collaborator_mock, tensor_key):
collaborator_mock.send_task_results.assert_called_with(result[0], round_number, task.name)
-def test_send_task_results(collaborator_mock, tensor_key):
- """Test that send_task_results works correctly."""
- task_name = 'task_name'
- tensor_key = tensor_key._replace(report=True)
- tensor_dict = {tensor_key: 0}
- round_number = 0
- data_size = -1
- collaborator_mock.nparray_to_named_tensor = mock.Mock(return_value=None)
- collaborator_mock.client.send_local_task_results = mock.Mock()
- collaborator_mock.send_task_results(tensor_dict, round_number, task_name)
-
- collaborator_mock.client.send_local_task_results.assert_called_with(
- round_number, task_name, data_size, [None])
-
-
def test_send_task_results_train(collaborator_mock):
"""Test that send_task_results for train tasks works correctly."""
task_name = 'train_task'
tensor_dict = {}
round_number = 0
data_size = 200
- collaborator_mock.nparray_to_named_tensor = mock.Mock()
collaborator_mock.task_runner.get_train_data_size = mock.Mock(return_value=data_size)
- collaborator_mock.client.send_local_task_results = mock.Mock()
+ collaborator_mock._serialisation_middleware._aggregator_client.send_local_task_results = mock.Mock()
collaborator_mock.send_task_results(tensor_dict, round_number, task_name)
- collaborator_mock.client.send_local_task_results.assert_called_with(
+ collaborator_mock._serialisation_middleware._aggregator_client.send_local_task_results.assert_called_with(
round_number, task_name, data_size, [])
@@ -129,61 +117,23 @@ def test_send_task_results_valid(collaborator_mock):
tensor_dict = {}
round_number = 0
data_size = 400
- collaborator_mock.nparray_to_named_tensor = mock.Mock()
collaborator_mock.task_runner.get_valid_data_size = mock.Mock(return_value=data_size)
- collaborator_mock.client.send_local_task_results = mock.Mock()
+ collaborator_mock._serialisation_middleware._aggregator_client.send_local_task_results = mock.Mock()
collaborator_mock.send_task_results(tensor_dict, round_number, task_name)
- collaborator_mock.client.send_local_task_results.assert_called_with(
+ collaborator_mock._serialisation_middleware._aggregator_client.send_local_task_results.assert_called_with(
round_number, task_name, data_size, [])
-def test_named_tensor_to_nparray_without_tags(collaborator_mock, named_tensor):
- """Test that named_tensor_to_nparray works correctly for tensor without tags."""
- nparray = collaborator_mock.named_tensor_to_nparray(named_tensor)
-
- assert named_tensor.data_bytes == nparray
-
-
-@pytest.mark.parametrize('tag', ['compressed', 'lossy_compressed'])
-def test_named_tensor_to_nparray_compressed_tag(collaborator_mock, named_tensor, tag):
- """Test that named_tensor_to_nparray works correctly for tensor with tags."""
- named_tensor.tags.append(tag)
- nparray = collaborator_mock.named_tensor_to_nparray(named_tensor)
-
- assert isinstance(nparray, numpy.ndarray)
-
-
-def test_nparray_to_named_tensor(collaborator_mock, tensor_key, named_tensor):
- """Test that nparray_to_named_tensor works correctly."""
- named_tensor.tags.append('compressed')
- nparray = collaborator_mock.named_tensor_to_nparray(named_tensor)
- tensor = collaborator_mock.nparray_to_named_tensor(tensor_key, nparray)
- assert tensor.data_bytes == named_tensor.data_bytes
- assert tensor.lossless is True
-
-
-def test_nparray_to_named_tensor_trained(collaborator_mock, tensor_key_trained, named_tensor):
- """Test that nparray_to_named_tensor works correctly for trained tensor."""
- named_tensor.tags.append('compressed')
- collaborator_mock.use_delta_updates = True
- nparray = collaborator_mock.named_tensor_to_nparray(named_tensor)
- collaborator_mock.tensor_db.get_tensor_from_cache = mock.Mock(
- return_value=nparray)
- tensor = collaborator_mock.nparray_to_named_tensor(tensor_key_trained, nparray)
- assert len(tensor.data_bytes) == 32
- assert tensor.lossless is False
- assert 'delta' in tensor.tags
-
@pytest.mark.parametrize('require_lossless', [True, False])
def test_get_aggregated_tensor_from_aggregator(collaborator_mock, tensor_key,
named_tensor, require_lossless):
"""Test that get_aggregated_tensor works correctly."""
- collaborator_mock.client.get_aggregated_tensor = mock.Mock(return_value=named_tensor)
+ collaborator_mock._serialisation_middleware._aggregator_client.get_aggregated_tensor = mock.Mock(return_value=named_tensor)
nparray = collaborator_mock.get_aggregated_tensor_from_aggregator(tensor_key, require_lossless)
- collaborator_mock.client.get_aggregated_tensor.assert_called_with(
+ collaborator_mock._serialisation_middleware._aggregator_client.get_aggregated_tensor.assert_called_with(
tensor_key.tensor_name, tensor_key.round_number,
tensor_key.report, tensor_key.tags, require_lossless)
assert nparray == named_tensor.data_bytes
@@ -225,8 +175,73 @@ def test_get_data_for_tensorkey_dependencies(collaborator_mock, tensor_key):
tensor_key = tensor_key._replace(round_number=1)
collaborator_mock.tensor_db.get_tensor_from_cache = mock.Mock(
return_value=None)
- collaborator_mock.tensor_codec.find_dependencies = mock.Mock(return_value=[tensor_key])
+ collaborator_mock._find_dependencies = mock.Mock(return_value=[tensor_key])
collaborator_mock.get_aggregated_tensor_from_aggregator = mock.Mock()
collaborator_mock.get_data_for_tensorkey(tensor_key)
collaborator_mock.get_aggregated_tensor_from_aggregator.assert_called_with(
tensor_key, require_lossless=True)
+
+
+def test_find_dependencies_without_send_model_deltas(collaborator_mock, tensor_key):
+ """Test that find_dependencies returns empty list when send_model_deltas = False."""
+ tensor_name, origin, _, report, _ = tensor_key
+ tensor_key = TensorKey(
+ tensor_name, origin, 5, report, ('model',)
+ )
+ tensor_key_dependencies = collaborator_mock._find_dependencies(tensor_key)
+
+ assert len(tensor_key_dependencies) == 0
+
+
+def test_find_dependencies_without_model_in_tags(collaborator_mock, tensor_key):
+ """Test that find_dependencies returns empty list when there is no model tag."""
+ collaborator_mock.use_delta_updates = True
+ tensor_key_dependencies = collaborator_mock._find_dependencies(tensor_key)
+
+ assert len(tensor_key_dependencies) == 0
+
+
+def test_find_dependencies_with_zero_round(collaborator_mock, tensor_key):
+ """Test that find_dependencies returns empty list when round number is 0."""
+ collaborator_mock.use_delta_updates = True
+ tensor_name, origin, round_number, report, tags = tensor_key
+ tensor_key = TensorKey(
+ tensor_name, origin, round_number, report, ('model',)
+ )
+ tensor_key_dependencies = collaborator_mock._find_dependencies(tensor_key)
+
+ assert len(tensor_key_dependencies) == 0
+
+
+# def test_find_dependencies(collaborator_mock, tensor_key):
+# """Test that find_dependencies works correctly."""
+# collaborator_mock.use_delta_updates = True
+# tensor_name, origin, round_number, report, tags = tensor_key
+# round_number = 2
+# tensor_key = TensorKey(
+# tensor_name, origin, round_number, report, ('model',)
+# )
+# tensor_key_dependencies = collaborator_mock._find_dependencies(tensor_key)
+
+# assert len(tensor_key_dependencies) == 2
+# tensor_key_dependency_0, tensor_key_dependency_1 = tensor_key_dependencies
+# assert tensor_key_dependency_0.round_number == round_number - 1
+# assert tensor_key_dependency_0.tags == tensor_key.tags
+# assert tensor_key_dependency_1.tags == ('aggregated', 'delta', 'compressed')
+
+
+# def test_find_dependencies_is_lossy(collaborator_mock, tensor_key):
+# """Test that find_dependencies works correctly with lossy_compressed."""
+# collaborator_mock.use_delta_updates = True
+# collaborator_mock.compression_pipeline.is_lossy = mock.Mock(return_value=True)
+# tensor_name, origin, round_number, report, tags = tensor_key
+# round_number = 2
+# tensor_key = TensorKey(
+# tensor_name, origin, round_number, report, ('model',)
+# )
+# tensor_key_dependencies = collaborator_mock._find_dependencies(tensor_key)
+
+# assert len(tensor_key_dependencies) == 2
+# tensor_key_dependency_0, tensor_key_dependency_1 = tensor_key_dependencies
+# assert tensor_key_dependency_0.round_number == round_number - 1
+# assert tensor_key_dependency_0.tags == tensor_key.tags
+# assert tensor_key_dependency_1.tags == ('aggregated', 'delta', 'lossy_compressed')
diff --git a/tests/openfl/databases/test_tensor_db.py b/tests/openfl/databases/test_tensor_db.py
index ba978ca0c1..892df37cfa 100644
--- a/tests/openfl/databases/test_tensor_db.py
+++ b/tests/openfl/databases/test_tensor_db.py
@@ -215,7 +215,7 @@ def test_get_aggregated_tensor_weights(tensor_db):
axis=0
)
- assert np.array_equal(agg_nparray, control_nparray)
+ assert np.allclose(agg_nparray, control_nparray)
def test_get_aggregated_tensor_error_aggregation_function(tensor_db):
diff --git a/tests/openfl/pipelines/test_tensor_codec.py b/tests/openfl/pipelines/test_tensor_codec.py
index 5ff1b6c704..ce7c3b6b40 100644
--- a/tests/openfl/pipelines/test_tensor_codec.py
+++ b/tests/openfl/pipelines/test_tensor_codec.py
@@ -46,6 +46,21 @@ def tensor_key(named_tensor):
return tensor_key
+@pytest.fixture
+def tensor_key_trained(named_tensor):
+ """Initialize the tensor_key_trained mock."""
+ named_tensor.tags.append('trained')
+ tensor_key = TensorKey(
+ named_tensor.name,
+ 'col1',
+ named_tensor.round_number,
+ named_tensor.report,
+ tuple(named_tensor.tags)
+ )
+ return tensor_key
+
+
def test_compress(tensor_key, named_tensor):
"""Test that compress works correctly."""
tensor_codec = TensorCodec(NoCompressionPipeline())
@@ -239,153 +254,30 @@ def test_decompress_compressed_in_tags(tensor_key, named_tensor):
assert 'compressed' not in decompressed_tensor_key.tags
-def test_generate(tensor_key, named_tensor):
- """Test that generate_delta works correctly."""
- tensor_codec = TensorCodec(NoCompressionPipeline())
- metadata = [{'int_to_float': proto.int_to_float,
- 'int_list': proto.int_list,
- 'bool_list': proto.bool_list
- } for proto in named_tensor.transformer_metadata]
- array_shape = tuple(metadata[0]['int_list'])
- flat_array = np.frombuffer(named_tensor.data_bytes, dtype=np.float32)
-
- nparray = np.reshape(flat_array, newshape=array_shape, order='C')
-
- delta_tensor_key, delta_nparray = tensor_codec.generate_delta(tensor_key, nparray, nparray)
-
- assert np.array_equal(delta_nparray, nparray - nparray)
- assert 'delta' in delta_tensor_key.tags
-
-
-def test_generate_delta_assert_model_in_tags(tensor_key, named_tensor):
- """Test that generate_delta raises exception when there is model tag."""
- tensor_codec = TensorCodec(NoCompressionPipeline())
- tensor_name, origin, round_number, report, tags = tensor_key
- tensor_key = TensorKey(
- tensor_name, origin, round_number, report, ('model',)
- )
- metadata = [{'int_to_float': proto.int_to_float,
- 'int_list': proto.int_list,
- 'bool_list': proto.bool_list
- } for proto in named_tensor.transformer_metadata]
- array_shape = tuple(metadata[0]['int_list'])
- flat_array = np.frombuffer(named_tensor.data_bytes, dtype=np.float32)
-
- nparray = np.reshape(flat_array, newshape=array_shape, order='C')
-
- with pytest.raises(AssertionError):
- tensor_codec.generate_delta(tensor_key, nparray, nparray)
-
-
-def test_apply_delta_agg(tensor_key, named_tensor):
- """Test that apply_delta works for aggregator tensor_key."""
- tensor_codec = TensorCodec(NoCompressionPipeline())
- tensor_name, origin, round_number, report, tags = tensor_key
- tensor_key = TensorKey(
- tensor_name, 'aggregator_1', round_number, report, ('delta',)
- )
- metadata = [{'int_to_float': proto.int_to_float,
- 'int_list': proto.int_list,
- 'bool_list': proto.bool_list
- } for proto in named_tensor.transformer_metadata]
- array_shape = tuple(metadata[0]['int_list'])
- flat_array = np.frombuffer(named_tensor.data_bytes, dtype=np.float32)
-
- nparray = np.reshape(flat_array, newshape=array_shape, order='C')
-
- new_model_tensor_key, nparray_with_delta = tensor_codec.apply_delta(
- tensor_key, nparray, nparray)
-
- assert 'delta' not in new_model_tensor_key.tags
- assert np.array_equal(nparray_with_delta, nparray + nparray)
-
-
-def test_apply_delta_col(tensor_key, named_tensor):
- """Test that apply_delta works for collaborator tensor_key."""
- tensor_codec = TensorCodec(NoCompressionPipeline())
- tensor_name, origin, round_number, report, tags = tensor_key
- tensor_key = TensorKey(
- tensor_name, origin, round_number, report, ('delta',)
- )
- metadata = [{'int_to_float': proto.int_to_float,
- 'int_list': proto.int_list,
- 'bool_list': proto.bool_list
- } for proto in named_tensor.transformer_metadata]
- array_shape = tuple(metadata[0]['int_list'])
- flat_array = np.frombuffer(named_tensor.data_bytes, dtype=np.float32)
-
- nparray = np.reshape(flat_array, newshape=array_shape, order='C')
-
- new_model_tensor_key, nparray_with_delta = tensor_codec.apply_delta(
- tensor_key, nparray, nparray)
-
- assert 'model' in new_model_tensor_key.tags
- assert 'delta' not in new_model_tensor_key.tags
- assert np.array_equal(nparray_with_delta, nparray + nparray)
-
-
-def test_find_dependencies_without_send_model_deltas(tensor_key):
- """Test that find_dependencies returns empty list when send_model_deltas = False."""
+def test_deserialise_without_tags(named_tensor):
+ """Test that deserialise works correctly for tensor without tags."""
tensor_codec = TensorCodec(NoCompressionPipeline())
- tensor_name, origin, round_number, report, tags = tensor_key
- tensor_key = TensorKey(
- tensor_name, origin, 5, report, ('model',)
- )
- tensor_key_dependencies = tensor_codec.find_dependencies(tensor_key, False)
+ _, nparray = tensor_codec.deserialise(named_tensor, 'col1')
- assert len(tensor_key_dependencies) == 0
+ assert named_tensor.data_bytes == nparray
-def test_find_dependencies_without_model_in_tags(tensor_key):
- """Test that find_dependencies returns empty list when there is no model tag."""
+@pytest.mark.parametrize('tag', ['compressed', 'lossy_compressed'])
+def test_deserialise_compressed_tag(named_tensor, tag):
+ """Test that deserialise works correctly for tensor with tags."""
+ named_tensor.tags.append(tag)
tensor_codec = TensorCodec(NoCompressionPipeline())
- tensor_key_dependencies = tensor_codec.find_dependencies(tensor_key, True)
+ _, nparray = tensor_codec.deserialise(named_tensor, 'col1')
- assert len(tensor_key_dependencies) == 0
+ assert isinstance(nparray, np.ndarray)
-def test_find_dependencies_with_zero_round(tensor_key):
- """Test that find_dependencies returns empty list when round number is 0."""
+def test_serialise(tensor_key, named_tensor):
+ """Test that serialise works correctly."""
+ named_tensor.tags.append('compressed')
tensor_codec = TensorCodec(NoCompressionPipeline())
- tensor_name, origin, round_number, report, tags = tensor_key
- tensor_key = TensorKey(
- tensor_name, origin, round_number, report, ('model',)
- )
- tensor_key_dependencies = tensor_codec.find_dependencies(tensor_key, True)
-
- assert len(tensor_key_dependencies) == 0
-
-
-def test_find_dependencies(tensor_key):
- """Test that find_dependencies works correctly."""
- tensor_codec = TensorCodec(NoCompressionPipeline())
- tensor_name, origin, round_number, report, tags = tensor_key
- round_number = 2
- tensor_key = TensorKey(
- tensor_name, origin, round_number, report, ('model',)
- )
- tensor_key_dependencies = tensor_codec.find_dependencies(tensor_key, True)
-
- assert len(tensor_key_dependencies) == 2
- tensor_key_dependency_0, tensor_key_dependency_1 = tensor_key_dependencies
- assert tensor_key_dependency_0.round_number == round_number - 1
- assert tensor_key_dependency_0.tags == tensor_key.tags
- assert tensor_key_dependency_1.tags == ('aggregated', 'delta', 'compressed')
-
-
-def test_find_dependencies_is_lossy(tensor_key):
- """Test that find_dependencies works correctly with lossy_compressed."""
- tensor_codec = TensorCodec(NoCompressionPipeline())
- tensor_codec.compression_pipeline.is_lossy = mock.Mock(return_value=True)
- tensor_name, origin, round_number, report, tags = tensor_key
- round_number = 2
- tensor_key = TensorKey(
- tensor_name, origin, round_number, report, ('model',)
- )
- tensor_key_dependencies = tensor_codec.find_dependencies(tensor_key, True)
+ _, nparray = tensor_codec.deserialise(named_tensor, 'col1')
+ tensor = tensor_codec.serialise(tensor_key, nparray)
- assert len(tensor_key_dependencies) == 2
- tensor_key_dependency_0, tensor_key_dependency_1 = tensor_key_dependencies
- assert tensor_key_dependency_0.round_number == round_number - 1
- assert tensor_key_dependency_0.tags == tensor_key.tags
- assert tensor_key_dependency_1.tags == ('aggregated', 'delta', 'lossy_compressed')
+ assert tensor.data_bytes == named_tensor.data_bytes
+ assert tensor.lossless is True
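The renamed codec entry points exercised by these tests can be summarised in a short round trip (a hedged sketch built only from the calls visible above; named_tensor is the protobuf fixture, carrying a 'compressed' tag so the payload passes through the pipeline):

    tensor_codec = TensorCodec(NoCompressionPipeline())

    # deserialise: protobuf NamedTensor -> (TensorKey, nparray), attributed
    # to the named origin.
    tensor_key, nparray = tensor_codec.deserialise(named_tensor, 'col1')

    # serialise: back to a NamedTensor; the no-op pipeline keeps the payload
    # byte-identical and marks it lossless.
    tensor = tensor_codec.serialise(tensor_key, nparray)
    assert tensor.data_bytes == named_tensor.data_bytes
    assert tensor.lossless is True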
diff --git a/tests/openfl/utilities/test_delta.py b/tests/openfl/utilities/test_delta.py
new file mode 100644
index 0000000000..dd46e22064
--- /dev/null
+++ b/tests/openfl/utilities/test_delta.py
@@ -0,0 +1,105 @@
+# Copyright (C) 2020-2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+"""Tensor codec tests module."""
+
+
+pass
+
+import numpy as np
+import pytest
+
+from openfl.protocols import base_pb2
+from openfl.utilities.utils import apply_delta, generate_delta
+from openfl.utilities.types import TensorKey
+
+
+@pytest.fixture
+def named_tensor():
+ """Initialize the named_tensor mock."""
+ tensor = base_pb2.NamedTensor(
+ name='tensor_name',
+ round_number=0,
+ lossless=False,
+ report=False,
+ data_bytes=32 * b'1'
+ )
+ metadata = tensor.transformer_metadata.add()
+ metadata.int_to_float[1] = 1.
+ metadata.int_list.extend([1, 8])
+ metadata.bool_list.append(True)
+
+ return tensor
+
+
+@pytest.fixture
+def tensor_key(named_tensor):
+ """Initialize the tensor_key mock."""
+ tensor_key = TensorKey(
+ named_tensor.name,
+ 'col1',
+ named_tensor.round_number,
+ named_tensor.report,
+ tuple(named_tensor.tags)
+ )
+ return tensor_key
+
+
+def test_generate(tensor_key, named_tensor):
+ """Test that generate_delta works correctly."""
+ metadata = [{'int_to_float': proto.int_to_float,
+ 'int_list': proto.int_list,
+ 'bool_list': proto.bool_list
+ } for proto in named_tensor.transformer_metadata]
+ array_shape = tuple(metadata[0]['int_list'])
+ flat_array = np.frombuffer(named_tensor.data_bytes, dtype=np.float32)
+
+ nparray = np.reshape(flat_array, newshape=array_shape, order='C')
+
+ delta_tensor_key, delta_nparray = generate_delta(tensor_key, nparray, nparray)
+
+ assert np.array_equal(delta_nparray, nparray - nparray)
+ assert 'delta' in delta_tensor_key.tags
+
+
+def test_apply_delta_agg(tensor_key, named_tensor):
+ """Test that apply_delta works for aggregator tensor_key."""
+ tensor_name, origin, round_number, report, tags = tensor_key
+ tensor_key = TensorKey(
+ tensor_name, 'aggregator_1', round_number, report, ('delta',)
+ )
+ metadata = [{'int_to_float': proto.int_to_float,
+ 'int_list': proto.int_list,
+ 'bool_list': proto.bool_list
+ } for proto in named_tensor.transformer_metadata]
+ array_shape = tuple(metadata[0]['int_list'])
+ flat_array = np.frombuffer(named_tensor.data_bytes, dtype=np.float32)
+
+ nparray = np.reshape(flat_array, newshape=array_shape, order='C')
+
+ new_model_tensor_key, nparray_with_delta = apply_delta(
+ tensor_key, nparray, nparray)
+
+ assert 'delta' not in new_model_tensor_key.tags
+ assert np.array_equal(nparray_with_delta, nparray + nparray)
+
+
+def test_apply_delta_col(tensor_key, named_tensor):
+ """Test that apply_delta works for collaborator tensor_key."""
+ tensor_name, origin, round_number, report, tags = tensor_key
+ tensor_key = TensorKey(
+ tensor_name, origin, round_number, report, ('delta',)
+ )
+ metadata = [{'int_to_float': proto.int_to_float,
+ 'int_list': proto.int_list,
+ 'bool_list': proto.bool_list
+ } for proto in named_tensor.transformer_metadata]
+ array_shape = tuple(metadata[0]['int_list'])
+ flat_array = np.frombuffer(named_tensor.data_bytes, dtype=np.float32)
+
+ nparray = np.reshape(flat_array, newshape=array_shape, order='C')
+
+ new_model_tensor_key, nparray_with_delta = apply_delta(
+ tensor_key, nparray, nparray)
+
+ assert 'model' in new_model_tensor_key.tags
+ assert 'delta' not in new_model_tensor_key.tags
+ assert np.array_equal(nparray_with_delta, nparray + nparray)
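Taken together, the two helpers form an inverse pair; a minimal sketch of the invariant the tests above rely on (base_model and new_model are hypothetical arrays of the same shape, and the input key must not carry the 'model' tag):

    # generate_delta computes new - base and tags the key with 'delta';
    # apply_delta adds the delta back onto the base and drops the tag.
    delta_key, delta = generate_delta(tensor_key, new_model, base_model)
    restored_key, restored = apply_delta(delta_key, delta, base_model)

    assert np.array_equal(restored, new_model)
    assert 'delta' in delta_key.tags
    assert 'delta' not in restored_key.tags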