diff --git a/.builders/images/runner_dependencies.txt b/.builders/images/runner_dependencies.txt index 3c3b0f9b9b352..18295cc266e2e 100644 --- a/.builders/images/runner_dependencies.txt +++ b/.builders/images/runner_dependencies.txt @@ -3,3 +3,5 @@ urllib3==2.2.0 auditwheel==6.0.0; sys_platform == 'linux' delvewheel==1.5.2; sys_platform == 'win32' delocate==0.13.0; sys_platform == 'darwin' +wheel==0.45.1 +pathspec==0.12.1 \ No newline at end of file diff --git a/.builders/scripts/build_backend.py b/.builders/scripts/build_backend.py new file mode 100644 index 0000000000000..54a467ddfab0d --- /dev/null +++ b/.builders/scripts/build_backend.py @@ -0,0 +1,96 @@ +import inspect +import shutil +import sys +import tomllib +import zipfile +from functools import cache +from pathlib import Path + +import pathspec +from setuptools import build_meta as _orig + + +def remove_test_files(wheel_path: Path) -> None: + """ + Remove excluded files and directories from a built wheel. + Prints the number of files removed. + """ + tmp_wheel = wheel_path.with_suffix(".tmp.whl") + removed_count = 0 + + with ( + zipfile.ZipFile(wheel_path, "r") as zin, + zipfile.ZipFile(tmp_wheel, "w", compression=zipfile.ZIP_DEFLATED) as zout, + ): + for info in zin.infolist(): + rel = info.filename + if is_excluded_from_wheel(rel): + removed_count += 1 + continue # skip excluded file or directory + + data = zin.read(rel) + zout.writestr(info, data) + + shutil.move(tmp_wheel, wheel_path) + print(f"Removed {removed_count} files from {wheel_path.name}") + + +def is_excluded_from_wheel(path: str | Path) -> bool: + """ + Return True if `path` (file or directory) should be excluded per files_to_remove.toml. + Matches: + - type annotation files: **/*.pyi, **/py.typed + - test directories listed with a trailing '/' + """ + spec = _load_excluded_spec() + rel = Path(path).as_posix() + + if spec.match_file(rel) or spec.match_file(rel + "/"): + return True + + return False + + +@cache +def _load_excluded_spec() -> pathspec.PathSpec: + """ + Load excluded paths from files_to_remove.toml and compile them + with .gitignore-style semantics. + """ + config_path = Path(__file__).parent / "files_to_remove.toml" + with open(config_path, "rb") as f: + config = tomllib.load(f) + + patterns = config.get("excluded_paths", []) + return pathspec.PathSpec.from_lines("gitignore", patterns) + + +def build_wheel(wheel_directory, config_settings=None, metadata_directory=None): + """Intercept wheel building to strip test files.""" + wheel_file = _orig.build_wheel(wheel_directory, config_settings, metadata_directory) + + # Post-process the wheel to remove tests + wheel_path = Path(wheel_directory) / wheel_file + remove_test_files(wheel_path) + + return wheel_file + + +# Proxy all other PEP 517 hooks +# prepare_metadata_for_build_wheel = _orig.prepare_metadata_for_build_wheel +# build_sdist = _orig.build_sdist +# (better do by iterating over _orig methods instead) +print("-> Inspecting _orig methods") +for name, func in inspect.getmembers(_orig, inspect.isfunction): + # Only copy methods if they haven't been defined in the current module + # (i.e., don't overwrite your custom build_wheel) + print("Name: ", name, "Func: ", func, "Is in globals: ", name in globals()) + if name not in globals(): + globals()[name] = func + print("Added to globals: ", name) + +# for name in dir(_orig): +# # Check if the attribute name is a PEP 517 hook and not one we defined/overrode +# if name.startswith('build_') or 'requires_for' in name: +# if name not in globals(): +# setattr(sys.modules[__name__], name, getattr(_orig, name)) diff --git a/.builders/scripts/build_wheels.py b/.builders/scripts/build_wheels.py index 8108b84a3ff5e..cec4dcdd21491 100644 --- a/.builders/scripts/build_wheels.py +++ b/.builders/scripts/build_wheels.py @@ -1,26 +1,38 @@ from __future__ import annotations import argparse +import email import json import os +import re +import shutil import subprocess import sys +import time +import tomllib +from functools import cache +from hashlib import sha256 from pathlib import Path from tempfile import TemporaryDirectory from typing import TypedDict from zipfile import ZipFile +import pathspec +import urllib3 from dotenv import dotenv_values -from utils import extract_metadata, normalize_project_name +from utils import iter_wheels INDEX_BASE_URL = 'https://agent-int-packages.datadoghq.com' CUSTOM_EXTERNAL_INDEX = f'{INDEX_BASE_URL}/external' CUSTOM_BUILT_INDEX = f'{INDEX_BASE_URL}/built' +UNNORMALIZED_PROJECT_NAME_CHARS = re.compile(r'[-_.]+') + class WheelSizes(TypedDict): compressed: int uncompressed: int + if sys.platform == 'win32': PY3_PATH = Path('C:\\py3\\Scripts\\python.exe') PY2_PATH = Path('C:\\py2\\Scripts\\python.exe') @@ -62,6 +74,83 @@ def check_process(*args, **kwargs) -> subprocess.CompletedProcess: return process +def extract_metadata(wheel: Path) -> email.Message: + with ZipFile(str(wheel)) as zip_archive: + for path in zip_archive.namelist(): + root = path.split('/', 1)[0] + if root.endswith('.dist-info'): + dist_info_dir = root + break + else: + message = f'Could not find the `.dist-info` directory in wheel: {wheel.name}' + raise RuntimeError(message) + + try: + with zip_archive.open(f'{dist_info_dir}/METADATA') as zip_file: + metadata_file_contents = zip_file.read().decode('utf-8') + except KeyError: + message = f'Could not find a `METADATA` file in the `{dist_info_dir}` directory' + raise RuntimeError(message) from None + + return email.message_from_string(metadata_file_contents) + + +def normalize_project_name(name: str) -> str: + # https://peps.python.org/pep-0503/#normalized-names + return UNNORMALIZED_PROJECT_NAME_CHARS.sub('-', name).lower() + + +@cache +def get_wheel_hashes(project) -> dict[str, str]: + retry_wait = 2 + while True: + try: + response = urllib3.request( + 'GET', + f'https://pypi.org/simple/{project}', + headers={"Accept": "application/vnd.pypi.simple.v1+json"}, + ) + except urllib3.exceptions.HTTPError as e: + err_msg = f'Failed to fetch hashes for `{project}`: {e}' + else: + if response.status == 200: + break + + err_msg = f'Failed to fetch hashes for `{project}`, status code: {response.status}' + + print(err_msg) + print(f'Retrying in {retry_wait} seconds') + time.sleep(retry_wait) + retry_wait *= 2 + continue + + data = response.json() + return { + file['filename']: file['hashes']['sha256'] + for file in data['files'] + if file['filename'].endswith('.whl') and 'sha256' in file['hashes'] + } + + +def wheel_was_built(wheel: Path) -> bool: + project_metadata = extract_metadata(wheel) + project_name = normalize_project_name(project_metadata['Name']) + wheel_hashes = get_wheel_hashes(project_name) + if wheel.name not in wheel_hashes: + return True + + file_hash = sha256(wheel.read_bytes()).hexdigest() + return file_hash != wheel_hashes[wheel.name] + + +def add_dependency(dependencies: dict[str, str], sizes: dict[str, WheelSizes], wheel: Path) -> None: + project_metadata = extract_metadata(wheel) + project_name = normalize_project_name(project_metadata['Name']) + project_version = project_metadata['Version'] + dependencies[project_name] = project_version + sizes[project_name] = {'version': project_version, **calculate_wheel_sizes(wheel)} + + def calculate_wheel_sizes(wheel_path: Path) -> WheelSizes: compressed_size = wheel_path.stat(follow_symlinks=True).st_size with ZipFile(wheel_path) as zf: @@ -92,6 +181,13 @@ def main(): with TemporaryDirectory() as d: staged_wheel_dir = Path(d).resolve() + staged_built_wheels_dir = staged_wheel_dir / 'built' + staged_external_wheels_dir = staged_wheel_dir / 'external' + + # Create the directories + staged_built_wheels_dir.mkdir(parents=True, exist_ok=True) + staged_external_wheels_dir.mkdir(parents=True, exist_ok=True) + env_vars = dict(os.environ) env_vars['PATH'] = f'{python_path.parent}{os.pathsep}{env_vars["PATH"]}' env_vars['PIP_WHEEL_DIR'] = str(staged_wheel_dir) @@ -121,26 +217,35 @@ def main(): if constraints_file := env_vars.get('PIP_CONSTRAINT'): env_vars['PIP_CONSTRAINT'] = path_to_uri(constraints_file) + print("--------------------------------") + print("Building wheels") + print("--------------------------------") # Fetch or build wheels command_args = [ str(python_path), '-m', 'pip', 'wheel', + '--config-settings', + f'--build-backend={MOUNT_DIR / "scripts" / "build_backend.py"}', '-r', str(MOUNT_DIR / 'requirements.in'), '--wheel-dir', str(staged_wheel_dir), - # Temporarily removing extra index urls. See below. - # '--extra-index-url', CUSTOM_EXTERNAL_INDEX, + # '--extra-index-url', + # CUSTOM_EXTERNAL_INDEX, ] - # Temporarily disable extra index urls. There are broken wheels in the gcloud bucket - # while working on removing tests from them. Adding extra indices causes undefined behavior - # and can pull a broken image, preventing the building from running. - # if args.use_built_index: - # command_args.extend(['--extra-index-url', CUSTOM_BUILT_INDEX]) check_process(command_args, env=env_vars) + print("--------------------------------") + print("Finished building wheels") + print("--------------------------------") + # Classify wheels + for wheel in iter_wheels(staged_wheel_dir): + if wheel_was_built(wheel): + shutil.move(wheel, staged_built_wheels_dir) + else: + shutil.move(wheel, staged_external_wheels_dir) # Repair wheels check_process( @@ -148,8 +253,10 @@ def main(): sys.executable, '-u', str(MOUNT_DIR / 'scripts' / 'repair_wheels.py'), - '--source-dir', - str(staged_wheel_dir), + '--source-built-dir', + str(staged_built_wheels_dir), + '--source-external-dir', + str(staged_external_wheels_dir), '--built-dir', str(built_wheels_dir), '--external-dir', @@ -166,8 +273,6 @@ def main(): project_name = normalize_project_name(project_metadata['Name']) project_version = project_metadata['Version'] dependencies[project_name] = project_version - - sizes[project_name] = {'version': project_version, **calculate_wheel_sizes(wheel)} output_path = MOUNT_DIR / 'sizes.json' diff --git a/.builders/scripts/files_to_remove.toml b/.builders/scripts/files_to_remove.toml new file mode 100644 index 0000000000000..f60a460f27010 --- /dev/null +++ b/.builders/scripts/files_to_remove.toml @@ -0,0 +1,45 @@ +excluded_paths = [ + # --- Type annotation --- + "krb5/**/*.pyi", + "krb5/**/py.typed", + + "Cryptodome/**/*.pyi", + "Cryptodome/**/py.typed", + + "ddtrace/**/*.pyi", + "ddtrace/**/py.typed", + + "pyVmomi/**/*.pyi", + "pyVmomi/**/py.typed", + + "gssapi/**/*.pyi", + "gssapi/**/py.typed", + + # --- Tests --- + + "idlelib/idle_test/", + "bs4/tests/", + "Cryptodome/SelfTest/", + "gssapi/tests/", + "keystoneauth1/tests/", + "lazy_loader/tests/", + "openstack/tests/", + "os_service_types/tests/", + "pbr/tests/", + "pkg_resources/tests/", + "pip/_vendor/colorama/tests/", + "psutil/tests/", + "requests_unixsocket/tests/", + "securesystemslib/_vendor/ed25519/test_data/", + "setuptools/_distutils/compilers/C/tests/", + "setuptools/_vendor/packaging/tests/", + "setuptools/_distutils/tests/", + "setuptools/tests/", + "simplejson/tests/", + "stevedore/tests/", + "supervisor/tests/", + "/test/", + "vertica_python/tests/", + "websocket/tests/", + "win32com/test/", +] diff --git a/.builders/scripts/repair_wheels.py b/.builders/scripts/repair_wheels.py index db5b2c125c789..ac181834250bc 100644 --- a/.builders/scripts/repair_wheels.py +++ b/.builders/scripts/repair_wheels.py @@ -5,65 +5,12 @@ import re import shutil import sys -import time from fnmatch import fnmatch -from functools import cache -from hashlib import sha256 from pathlib import Path -from typing import Iterator, NamedTuple +from typing import NamedTuple from zipfile import ZipFile -import urllib3 -from utils import extract_metadata, normalize_project_name - - -@cache -def get_wheel_hashes(project) -> dict[str, str]: - retry_wait = 2 - while True: - try: - response = urllib3.request( - 'GET', - f'https://pypi.org/simple/{project}', - headers={"Accept": "application/vnd.pypi.simple.v1+json"}, - ) - except urllib3.exceptions.HTTPError as e: - err_msg = f'Failed to fetch hashes for `{project}`: {e}' - else: - if response.status == 200: - break - - err_msg = f'Failed to fetch hashes for `{project}`, status code: {response.status}' - - print(err_msg) - print(f'Retrying in {retry_wait} seconds') - time.sleep(retry_wait) - retry_wait *= 2 - continue - - data = response.json() - return { - file['filename']: file['hashes']['sha256'] - for file in data['files'] - if file['filename'].endswith('.whl') and 'sha256' in file['hashes'] - } - - -def iter_wheels(source_dir: str) -> Iterator[Path]: - for entry in sorted(Path(source_dir).iterdir(), key=lambda entry: entry.name.casefold()): - if entry.suffix == '.whl' and entry.is_file(): - yield entry - - -def wheel_was_built(wheel: Path) -> bool: - project_metadata = extract_metadata(wheel) - project_name = normalize_project_name(project_metadata['Name']) - wheel_hashes = get_wheel_hashes(project_name) - if wheel.name not in wheel_hashes: - return True - - file_hash = sha256(wheel.read_bytes()).hexdigest() - return file_hash != wheel_hashes[wheel.name] +from utils import iter_wheels def find_patterns_in_wheel(wheel: Path, patterns: list[str]) -> list[str]: @@ -112,7 +59,7 @@ def check_unacceptable_files( sys.exit(1) -def repair_linux(source_dir: str, built_dir: str, external_dir: str) -> None: +def repair_linux(source_built_dir: str, source_external_dir: str, built_dir: str, external_dir: str) -> None: from auditwheel.patcher import Patchelf from auditwheel.policy import WheelPolicies from auditwheel.repair import repair_wheel @@ -138,19 +85,20 @@ def repair_linux(source_dir: str, built_dir: str, external_dir: str) -> None: policy['lib_whitelist'].remove('libz.so.1') del policy['symbol_versions']['ZLIB'] - for wheel in iter_wheels(source_dir): + for wheel in iter_wheels(source_external_dir): print(f'--> {wheel.name}') + print('Using existing wheel') - if not wheel_was_built(wheel): - print('Using existing wheel') + check_unacceptable_files( + wheel, + invalid_file_patterns=external_invalid_file_patterns, + ) + shutil.move(wheel, external_dir) + continue - check_unacceptable_files( - wheel, - invalid_file_patterns=external_invalid_file_patterns, - ) - shutil.move(wheel, external_dir) - continue + for wheel in iter_wheels(source_built_dir): + print(f'--> {wheel.name}') try: repair_wheel( policies, @@ -172,7 +120,7 @@ def repair_linux(source_dir: str, built_dir: str, external_dir: str) -> None: print('Repaired wheel') -def repair_windows(source_dir: str, built_dir: str, external_dir: str) -> None: +def repair_windows(source_built_dir: str, source_external_dir: str, built_dir: str, external_dir: str) -> None: import subprocess exclusions = ['mqic.dll'] @@ -183,19 +131,20 @@ def repair_windows(source_dir: str, built_dir: str, external_dir: str) -> None: '*.libs/libcrypto-3*.dll', ] - for wheel in iter_wheels(source_dir): + for wheel in iter_wheels(source_external_dir): print(f'--> {wheel.name}') + print('Using existing wheel') - if not wheel_was_built(wheel): - print('Using existing wheel') + check_unacceptable_files( + wheel, + invalid_file_patterns=external_invalid_file_patterns, + ) + shutil.move(wheel, external_dir) + continue - check_unacceptable_files( - wheel, - invalid_file_patterns=external_invalid_file_patterns, - ) - shutil.move(wheel, external_dir) - continue + for wheel in iter_wheels(source_built_dir): + print(f'--> {wheel.name}') # Platform independent wheels: move and rename to make platform specific wheel_name = WheelName.parse(wheel.name) if wheel_name.platform_tag == 'any': @@ -214,7 +163,7 @@ def repair_windows(source_dir: str, built_dir: str, external_dir: str) -> None: sys.exit(process.returncode) -def repair_darwin(source_dir: str, built_dir: str, external_dir: str) -> None: +def repair_darwin(source_built_dir: str, source_external_dir: str, built_dir: str, external_dir: str) -> None: from delocate import delocate_wheel from packaging.version import Version @@ -251,17 +200,17 @@ def repair_darwin(source_dir: str, built_dir: str, external_dir: str) -> None: def copy_filt_func(libname): return not any(excl.search(libname) for excl in exclusions) - + min_macos_version = Version(os.environ["MACOSX_DEPLOYMENT_TARGET"]) - for wheel in iter_wheels(source_dir): + for wheel in iter_wheels(source_external_dir): print(f'--> {wheel.name}') - if not wheel_was_built(wheel): - print('Using existing wheel') - - shutil.move(wheel, external_dir) - continue + print('Using existing wheel') + shutil.move(wheel, external_dir) + continue + for wheel in iter_wheels(source_built_dir): + print(f'--> {wheel.name}') # Platform independent wheels: move and rename to make platform specific wheel_name = WheelName.parse(wheel.name) if wheel_name.platform_tag == 'any': @@ -300,15 +249,16 @@ def main(): argparser = argparse.ArgumentParser( description='Repair wheels found in a directory with the platform-specific tool' ) - argparser.add_argument('--source-dir', required=True) + argparser.add_argument('--source-built-dir', required=True) + argparser.add_argument('--source-external-dir', required=True) argparser.add_argument('--built-dir', required=True) argparser.add_argument('--external-dir', required=True) args = argparser.parse_args() - print(f'Repairing wheels in: {args.source_dir}') + print(f'Repairing wheels in: {args.source_built_dir}') print(f'Outputting built wheels to: {args.built_dir}') print(f'Outputting external wheels to: {args.external_dir}') - REPAIR_FUNCTIONS[sys.platform](args.source_dir, args.built_dir, args.external_dir) + REPAIR_FUNCTIONS[sys.platform](args.source_built_dir, args.source_external_dir, args.built_dir, args.external_dir) if __name__ == '__main__': diff --git a/.builders/scripts/utils.py b/.builders/scripts/utils.py index 0750f1ba8237b..35e7436e69b21 100644 --- a/.builders/scripts/utils.py +++ b/.builders/scripts/utils.py @@ -1,34 +1,9 @@ -from __future__ import annotations - -import email -import re from pathlib import Path -from zipfile import ZipFile - -UNNORMALIZED_PROJECT_NAME_CHARS = re.compile(r'[-_.]+') - - -def normalize_project_name(name: str) -> str: - # https://peps.python.org/pep-0503/#normalized-names - return UNNORMALIZED_PROJECT_NAME_CHARS.sub('-', name).lower() - +from typing import Iterator -def extract_metadata(wheel: Path) -> email.Message: - with ZipFile(str(wheel)) as zip_archive: - for path in zip_archive.namelist(): - root = path.split('/', 1)[0] - if root.endswith('.dist-info'): - dist_info_dir = root - break - else: - message = f'Could not find the `.dist-info` directory in wheel: {wheel.name}' - raise RuntimeError(message) - try: - with zip_archive.open(f'{dist_info_dir}/METADATA') as zip_file: - metadata_file_contents = zip_file.read().decode('utf-8') - except KeyError: - message = f'Could not find a `METADATA` file in the `{dist_info_dir}` directory' - raise RuntimeError(message) from None +def iter_wheels(source_dir: str) -> Iterator[Path]: + for entry in sorted(Path(source_dir).iterdir(), key=lambda entry: entry.name.casefold()): + if entry.suffix == '.whl' and entry.is_file(): + yield entry - return email.message_from_string(metadata_file_contents) diff --git a/.builders/upload.py b/.builders/upload.py index 4a39e4bdd0655..867ce1c558ba3 100644 --- a/.builders/upload.py +++ b/.builders/upload.py @@ -111,7 +111,7 @@ def upload(targets_dir): if not is_valid_project_name(project_name): message = f'Invalid project name `{project_name}` found in wheel: {wheel.name}' raise RuntimeError(message) - + print(f'Project name: {project_name}') upload_data.append((normalize_project_name(project_name), project_metadata, wheel)) queued = len(upload_data)