Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
229 changes: 229 additions & 0 deletions pipeline/redirect_map.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
"""Generate a redirect map from openMINDS schema type URIs
to their rendered documentation pages on Read the Docs.

Run this script after cloning/building the schema sources:

python -m pipeline.redirect_map

It produces a JSON file named `redirect_map.json` in the project root, e.g.

"/types/Subject": "https://openminds.om-i.org/en/latest/schema_specifications/core/research/subject.html#subject"
Comment on lines +8 to +10
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
It produces a JSON file named `redirect_map.json` in the project root, e.g.
"/types/Subject": "https://openminds.om-i.org/en/latest/schema_specifications/core/research/subject.html#subject"
It produces a file named `.htaccess` in the project root, e.g.
Redirect 301 "/types/Subject" "https://openminds.docs.om-i.org/en/latest/schema_specifications/core/research/subject.html#subject"


Serve the resulting map with your static-site host or edge platform
(e.g. Netlify, Vercel, Cloudflare Workers) to perform the redirects.
"""

from __future__ import annotations

import os
from typing import Dict

from pipeline.utils import clone_sources, SchemaLoader, InstanceLoader
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

# Helper functions

def _pluralize(term: str) -> str:
"""Return an English plural for simple openMINDS instance types."""
if term.endswith("s"):
return f"{term}es"
if term.endswith("y"):
return f"{term[:-1]}ies"
return f"{term}s"


def _anchorize(name: str) -> str:
"""Convert an instance name to the anchor used in HTML (lower-case, '_' and '.' → '-')."""
return name.replace("_", "-").replace(".", "-").lower()

# ---------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------

DOCS_BASE_URL = "https://openminds.docs.om-i.org" # without trailing slash
DOCS_VERSION_SLUG = "latest" # RTD version alias to use
OUTPUT_FILENAME = ".htaccess" # output path (project root)


# ---------------------------------------------------------------------
# Core logic
# ---------------------------------------------------------------------


def generate_redirect_map() -> Dict[str, str]:
"""
Return a {uri -> url} mapping for *all* schema types discovered
across all versions in the openMINDS sources.
"""
# Ensure fresh sources
clone_sources()

loader = SchemaLoader()
redirect_map: Dict[str, str] = {}

# Track which version each schema appears in (prefer latest available)
schema_versions = {}

# Process versions in priority order: latest first, then highest version numbers
all_versions = loader.get_schema_versions()
ordered_versions = []
if "latest" in all_versions:
ordered_versions.append("latest")
# Sort versioned releases in descending order (v4.0, v3.0, v2.0, v1.0)
versioned = [v for v in all_versions if v.startswith("v") and v != "latest"]
versioned.sort(reverse=True)
ordered_versions.extend(versioned)

for version in ordered_versions:
abs_paths = loader.find_schemas(version)
rel_paths = loader.get_relative_paths_for_schema_docu(abs_paths, version)

for schema_name, rel_path in rel_paths.items():
# Only record if we haven't seen this schema yet (prioritizes earlier processed versions)
if schema_name not in schema_versions:
schema_versions[schema_name] = {
"version": version,
"rel_path": rel_path
}

# Generate URLs using the appropriate version for each schema
for schema_name, info in schema_versions.items():
uri = f"/types/{schema_name}"
version_slug = DOCS_VERSION_SLUG if info["version"] == "latest" else info["version"]
url = (
f"{DOCS_BASE_URL}/en/{version_slug}/schema_specifications/"
f"{info['rel_path']}.html#{schema_name.lower()}"
)
redirect_map[uri] = url

# ----------------------------------------------------------
# Instance redirects
# ----------------------------------------------------------
iloader = InstanceLoader()
simple_types = {"brainAtlases", "contentTypes", "commonCoordinateSpaces", "licenses"}
subpage_types = {"parcellationEntities", "brainAtlasVersions", "commonCoordinateSpaceVersions"}
subpage2_types = {"parcellationEntityVersions"}

for version in iloader.get_instance_versions():
abs_paths = iloader.find_instances(version)
base_dir = os.path.join(iloader.instances_sources, version)
for ap in abs_paths:
rel = os.path.relpath(ap, start=base_dir)
inst_type = rel.split(os.sep)[0]
filename = os.path.basename(ap).replace(".jsonld", "")
plural = _pluralize(inst_type)

# Determine page location based on instance type
if inst_type in simple_types:
page_path = f"instance_libraries/{inst_type}.html"
elif inst_type in subpage_types:
page_heading = filename.split("_")[0]
page_path = f"instance_libraries/{inst_type}/{page_heading}.html"
elif inst_type in subpage2_types:
page_heading = "_".join(filename.split("_")[:2])
page_path = f"instance_libraries/{inst_type}/{page_heading}.html"
else:
# Handle terminologies with subdirectories
rel_parts = rel.split(os.sep)
if len(rel_parts) > 1:
# Extract the subdirectory (e.g., 'molecularEntity' from 'terminologies/molecularEntity/filename.jsonld')
subdir = rel_parts[1]
page_path = f"instance_libraries/terminologies/{subdir}.html"
else:
page_path = f"instance_libraries/terminologies/{inst_type}.html"

anchor = _anchorize(filename)
uri = f"/instances/{inst_type}/{filename}"
Copy link
Member

@Raphael-Gazzotti Raphael-Gazzotti Jul 8, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
uri = f"/instances/{inst_type}/{filename}"
if inst_type in ["Licenses", "contentTypes"]:
uri = f"/instances/{inst_type}/{filename}"
elif inst_type == 'terminologies':
uri = f"/instances/{subdir}/{filename}"
elif inst_type == 'parcellationEntities':
uri = f"/instances/parcellationEntity/{filename}"
elif inst_type == 'brainAtlases':
uri = f"/instances/brainAtlas/{filename}"
else:
uri = f"/instances/{inst_type[:-1]}/{filename}"

url = f"{DOCS_BASE_URL}/en/{DOCS_VERSION_SLUG}/{page_path}#{anchor}"
redirect_map[uri] = url

return redirect_map


# ---------------------------------------------------------------------
# Verification helpers
# ---------------------------------------------------------------------


def _url_works(url: str, timeout: float = 10.0) -> bool:
"""Return True if the URL returns HTTP <400 (HEAD, falling back to GET)."""
try:
r = requests.head(url, timeout=timeout, allow_redirects=True)
if r.status_code < 400:
return True
# Some servers don't handle HEAD properly – try GET but fetch nothing
r = requests.get(url, timeout=timeout, allow_redirects=True, stream=True)
return r.status_code < 400
except Exception:
return False


def verify_redirect_map(filename: str = OUTPUT_FILENAME, max_workers: int = 16) -> None:
"""
Load an existing .htaccess file and check every target URL.
"""
redirects = {}

# Parse .htaccess format: Redirect 301 "/uri" "url"
with open(filename, "r", encoding="utf-8") as fp:
for line in fp:
line = line.strip()
if line.startswith('Redirect 301 "') and line.count('"') >= 4:
# Extract URI and URL from: Redirect 301 "/uri" "url"
parts = line.split('"')
if len(parts) >= 4:
uri = parts[1]
url = parts[3]
redirects[uri] = url

# 1. Identify broken URLs in parallel
broken_keys = []

with ThreadPoolExecutor(max_workers=max_workers) as pool:
future_to_key = {pool.submit(_url_works, url): key for key, url in redirects.items()}
for fut in as_completed(future_to_key):
key = future_to_key[fut]
try:
ok = fut.result()
except Exception:
ok = False
if not ok:
print(f"Broken link found: {key} -> {redirects[key]}")
broken_keys.append(key)

if not broken_keys:
print("All redirect targets are reachable – nothing to fix.")
return

print(f"{len(broken_keys)} broken links detected.")


def main() -> None:
redirects = generate_redirect_map()

# Write .htaccess file
with open(OUTPUT_FILENAME, "w", encoding="utf-8") as fp:
fp.write("# openMINDS redirect rules\n")
fp.write("# Generated automatically by pipeline/redirect_map.py\n\n")

# Sort by URI for consistent output
sorted_redirects = sorted(redirects.items())

for uri, url in sorted_redirects:
# Apache redirect rule format: Redirect 301 /from /to
fp.write(f'Redirect 301 "{uri}" "{url}"\n')

# Add generic redirect for all other paths
fp.write(f'\n# Generic redirect for all other paths\n')
fp.write(f'RedirectMatch 301 /(.*) {DOCS_BASE_URL}/$1\n')

print(f"Wrote {len(redirects)} redirect entries to {OUTPUT_FILENAME}")

# Optional: immediately verify all links
print("Verifying redirect targets...")
verify_redirect_map(OUTPUT_FILENAME)


if __name__ == "__main__":
main()