feat(deploy): add automated OmniParser deployment

abrichr · web-flow · commit 8acd7c0c09b3 · 2025-02-19T16:48:45.000-05:00
* add working omniparser deploy.py, Dockerfile, pyproject.toml, README.md, .env.example, .dockerignore, client.py
diff --git a/deploy/.env.example b/deploy/.env.example
@@ -0,0 +1,3 @@
+AWS_ACCESS_KEY_ID=
+AWS_SECRET_ACCESS_KEY=
+AWS_REGION=
diff --git a/deploy/README.md b/deploy/README.md
@@ -0,0 +1,10 @@
+```
+# First time setup
+cd deploy
+uv venv
+source .venv/bin/activate
+uv pip install -e .
+
+# Subsequent usage (run from the deploy/ directory with .venv activated)
+python deploy/models/omniparser/deploy.py start
+```
diff --git a/deploy/deploy/models/omniparser/.dockerignore b/deploy/deploy/models/omniparser/.dockerignore
@@ -0,0 +1,20 @@
+__pycache__
+*.pyc
+*.pyo
+*.pyd
+.Python
+env
+pip-log.txt
+pip-delete-this-directory.txt
+.tox
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.log
+.pytest_cache
+.env
+.venv
+.DS_Store
diff --git a/deploy/deploy/models/omniparser/Dockerfile b/deploy/deploy/models/omniparser/Dockerfile
@@ -0,0 +1,59 @@
+# Image for serving OmniParser: CUDA base, Miniconda env "omni", the upstream
+# OmniParser checkout with V2 weights, and OCR models baked in at build time.
+FROM nvidia/cuda:12.3.1-devel-ubuntu22.04
+
+# System packages: git-lfs and wget are used by later build steps.
+# NOTE(review): libgl1 / libglib2.0-0 are presumably runtime libraries needed
+# by OpenCV - confirm before removing.
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
+    git-lfs \
+    wget \
+    libgl1 \
+    libglib2.0-0 \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/* \
+    && git lfs install
+
+# Install Miniconda into /opt/conda and put it on PATH.
+RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \
+    bash miniconda.sh -b -p /opt/conda && \
+    rm miniconda.sh
+ENV PATH="/opt/conda/bin:$PATH"
+
+# Create the "omni" environment. -y answers the confirmation prompt explicitly
+# so the build never depends on how conda treats a missing stdin.
+RUN conda create -y -n omni python=3.12 && \
+    echo "source activate omni" > ~/.bashrc
+ENV CONDA_DEFAULT_ENV=omni
+# Putting the env's bin directory first makes python3/pip resolve to the
+# "omni" environment, including for the exec-form CMD below.
+ENV PATH="/opt/conda/envs/omni/bin:$PATH"
+
+WORKDIR /app
+
+# Fetch the upstream OmniParser sources, including any LFS-tracked files.
+RUN git clone https://github.com/microsoft/OmniParser.git && \
+    cd OmniParser && \
+    git lfs install && \
+    git lfs pull
+
+WORKDIR /app/OmniParser
+
+# Python dependencies: pin the headless OpenCV build, then install the
+# project's requirements plus the packages used to download weights and serve.
+# NOTE(review): requirements.txt is installed after the pin, so it may pull in
+# a different OpenCV build - confirm against upstream requirements.
+RUN . /opt/conda/etc/profile.d/conda.sh && conda activate omni && \
+    pip uninstall -y opencv-python opencv-python-headless && \
+    pip install --no-cache-dir opencv-python-headless==4.8.1.78 && \
+    pip install -r requirements.txt && \
+    pip install huggingface_hub fastapi uvicorn
+
+# Download V2 weights
+# icon_caption is renamed to icon_caption_florence because that is the path
+# passed to --caption_model_path in CMD below.
+RUN . /opt/conda/etc/profile.d/conda.sh && conda activate omni && \
+    mkdir -p /app/OmniParser/weights && \
+    cd /app/OmniParser && \
+    rm -rf weights/icon_detect weights/icon_caption weights/icon_caption_florence && \
+    for folder in icon_caption icon_detect; do \
+        huggingface-cli download microsoft/OmniParser-v2.0 --local-dir weights --repo-type model --include "$folder/*"; \
+    done && \
+    mv weights/icon_caption weights/icon_caption_florence
+
+# Pre-download OCR models during build
+# Instantiating each OCR reader once here triggers its model download.
+RUN . /opt/conda/etc/profile.d/conda.sh && conda activate omni && \
+    cd /app/OmniParser && \
+    python3 -c "import easyocr; reader = easyocr.Reader(['en']); print('Downloaded EasyOCR model')" && \
+    python3 -c "from paddleocr import PaddleOCR; ocr = PaddleOCR(lang='en', use_angle_cls=False, use_gpu=False, show_log=False); print('Downloaded PaddleOCR model')"
+
+# Start the OmniParser server on all interfaces, port 8000.
+# NOTE(review): the flag is spelled --BOX_TRESHOLD here; it must match the
+# argument name defined by the server script - check upstream before renaming.
+CMD ["python3", "/app/OmniParser/omnitool/omniparserserver/omniparserserver.py", \
+     "--som_model_path", "/app/OmniParser/weights/icon_detect/model.pt", \
+     "--caption_model_path", "/app/OmniParser/weights/icon_caption_florence", \
+     "--device", "cuda", \
+     "--BOX_TRESHOLD", "0.05", \
+     "--host", "0.0.0.0", \
+     "--port", "8000"]
diff --git a/deploy/deploy/models/omniparser/client.py b/deploy/deploy/models/omniparser/client.py
@@ -0,0 +1,128 @@
+"""Client module for interacting with the OmniParser server."""
+
+import base64
+import fire
+import requests
+
+from loguru import logger
+from PIL import Image, ImageDraw
+
+
+def image_to_base64(image_path: str) -> str:
+    """Read an image file and return its contents as base64 text.
+
+    Args:
+        image_path: Path to the image file
+
+    Returns:
+        str: The file's raw bytes, base64 encoded and decoded as UTF-8
+    """
+    # Binary mode: the bytes are encoded as-is, with no image decoding.
+    with open(image_path, "rb") as handle:
+        raw_bytes = handle.read()
+    encoded = base64.b64encode(raw_bytes)
+    return encoded.decode("utf-8")
+
+
+def plot_results(
+    original_image_path: str,
+    som_image_base64: str,
+    parsed_content_list: list[dict[str, list[float]]],
+) -> None:
+    """Draw the parsed bounding boxes and labels on the original image and show it.
+
+    Args:
+        original_image_path: Path to the original image
+        som_image_base64: Base64 encoded SOM image (accepted but not used here)
+        parsed_content_list: Parsed items; each provides a "bbox" of four
+            values scaled by the image size and a "content" label
+    """
+    canvas = Image.open(original_image_path)
+    img_w, img_h = canvas.size
+    pen = ImageDraw.Draw(canvas)
+
+    for entry in parsed_content_list:
+        # Box corners arrive as fractions of the image size; scale to pixels.
+        left, top, right, bottom = entry["bbox"]
+        top_left = (int(left * img_w), int(top * img_h))
+        bottom_right = (int(right * img_w), int(bottom * img_h))
+
+        caption = entry["content"]
+
+        # Outline of the detected element
+        pen.rectangle([top_left, bottom_right], outline="red", width=2)
+
+        # White backing, padded by 2px, so the caption stays readable
+        tb_left, tb_top, tb_right, tb_bottom = pen.textbbox(top_left, caption)
+        pen.rectangle(
+            [tb_left - 2, tb_top - 2, tb_right + 2, tb_bottom + 2],
+            fill="white",
+        )
+
+        # Caption anchored at the box's top-left corner
+        pen.text(top_left, caption, fill="red")
+
+    canvas.show()
+
+
+def parse_image(
+    image_path: str,
+    server_url: str,
+    timeout: float = 120.0,
+) -> None:
+    """Parse an image using the OmniParser server and display the result.
+
+    Probes ``{server_url}/probe/`` first, then posts the base64-encoded image
+    to ``{server_url}/parse/``, plots the returned bounding boxes on the
+    original image and logs the latency reported by the server. Request and
+    response errors are logged rather than raised.
+
+    Args:
+        image_path: Path to the image file
+        server_url: URL of the OmniParser server
+        timeout: Seconds to wait on each HTTP request before giving up
+    """
+    # Remove trailing slash from server_url if present
+    server_url = server_url.rstrip("/")
+
+    # Convert image to base64
+    base64_image = image_to_base64(image_path)
+
+    # Prepare request
+    url = f"{server_url}/parse/"
+    payload = {"base64_image": base64_image}
+
+    try:
+        # First, check if the server is available.
+        # requests never times out by default, so an explicit timeout keeps an
+        # unresponsive server from hanging the client; a timeout is reported
+        # by the RequestException handler below.
+        probe_url = f"{server_url}/probe/"
+        probe_response = requests.get(probe_url, timeout=timeout)
+        probe_response.raise_for_status()
+        logger.info("Server is available")
+
+        # Make request to API
+        response = requests.post(url, json=payload, timeout=timeout)
+        response.raise_for_status()
+
+        # Parse response
+        result = response.json()
+        som_image_base64 = result["som_image_base64"]
+        parsed_content_list = result["parsed_content_list"]
+
+        # Plot results
+        plot_results(image_path, som_image_base64, parsed_content_list)
+
+        # Print latency
+        logger.info(f"API Latency: {result['latency']:.2f} seconds")
+
+    except requests.exceptions.ConnectionError:
+        logger.error(f"Error: Could not connect to server at {server_url}")
+        logger.error("Please check if the server is running and the URL is correct")
+    except requests.exceptions.RequestException as e:
+        logger.error(f"Error making request to API: {e}")
+    except Exception as e:
+        # Catch-all (e.g. a missing key in the response) so the CLI logs the
+        # problem instead of printing a traceback.
+        logger.error(f"Error: {e}")
+
+
+def main() -> None:
+    """Expose ``parse_image`` as a command-line interface via Fire."""
+    cli_target = parse_image
+    fire.Fire(cli_target)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/deploy/deploy/models/omniparser/deploy.py b/deploy/deploy/models/omniparser/deploy.py
diff --git a/deploy/pyproject.toml b/deploy/pyproject.toml

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+AWS_ACCESS_KEY_ID=`
	`2`	`+AWS_SECRET_ACCESS_KEY=`
	`3`	`+AWS_REGION=`