From 605a80dfb46b23533b979e12189d9311b93ef502 Mon Sep 17 00:00:00 2001 From: HossyWorlds Date: Sat, 19 Jul 2025 21:35:51 +0900 Subject: [PATCH 1/4] feat: Add batch processing capability for directory conversion --- .../markitdown/src/markitdown/__main__.py | 110 +++++++++++++++++- packages/markitdown/tests/test_cli_misc.py | 40 ++++++- 2 files changed, 145 insertions(+), 5 deletions(-) diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py index 6085ad6b..ff29fb10 100644 --- a/packages/markitdown/src/markitdown/__main__.py +++ b/packages/markitdown/src/markitdown/__main__.py @@ -110,6 +110,25 @@ def main(): help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.", ) + parser.add_argument( + "-b", + "--batch", + action="store_true", + help="Process all supported files in a directory. If specified, filename should be a directory path.", + ) + + parser.add_argument( + "-r", + "--recursive", + action="store_true", + help="Process subdirectories recursively when using batch mode.", + ) + + parser.add_argument( + "--types", + help="Comma-separated list of file extensions to process in batch mode (e.g., pdf,docx,pptx). 
If not specified, all supported types are processed.", + ) + parser.add_argument("filename", nargs="?") args = parser.parse_args() @@ -186,18 +205,23 @@ def main(): else: markitdown = MarkItDown(enable_plugins=args.use_plugins) - if args.filename is None: + if args.batch: + if args.filename is None: + _exit_with_error("Directory path is required when using batch mode.") + + _handle_batch_processing(args, markitdown, stream_info) + elif args.filename is None: result = markitdown.convert_stream( sys.stdin.buffer, stream_info=stream_info, keep_data_uris=args.keep_data_uris, ) + _handle_output(args, result) else: result = markitdown.convert( args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris ) - - _handle_output(args, result) + _handle_output(args, result) def _handle_output(args, result: DocumentConverterResult): @@ -219,5 +243,85 @@ def _exit_with_error(message: str): sys.exit(1) +def _handle_batch_processing(args, markitdown: MarkItDown, stream_info): + """Handle batch processing of files in a directory""" + import os + from pathlib import Path + + input_dir = Path(args.filename) + if not input_dir.exists(): + _exit_with_error(f"Directory does not exist: {input_dir}") + if not input_dir.is_dir(): + _exit_with_error(f"Path is not a directory: {input_dir}") + + # Determine output directory + output_dir = Path(args.output) if args.output else input_dir / "converted" + output_dir.mkdir(parents=True, exist_ok=True) + + # Get supported file types + supported_extensions = { + 'pdf', 'docx', 'pptx', 'xlsx', 'xls', 'csv', 'txt', 'html', 'htm', + 'json', 'xml', 'rss', 'msg', 'zip', 'epub', 'jpg', 'jpeg', 'png', + 'gif', 'bmp', 'tiff', 'wav', 'mp3', 'm4a', 'mp4' + } + + # Parse user-specified types + if args.types: + user_types = {ext.strip().lower().lstrip('.') for ext in args.types.split(',')} + supported_extensions = supported_extensions.intersection(user_types) + + # Find files to process + pattern = "**/*" if args.recursive else "*" + 
files_to_process = [] + + for file_path in input_dir.glob(pattern): + if file_path.is_file(): + extension = file_path.suffix.lower().lstrip('.') + if extension in supported_extensions: + files_to_process.append(file_path) + + if not files_to_process: + print(f"No supported files found in {input_dir}") + return + + print(f"Found {len(files_to_process)} files to process") + + # Process files + processed = 0 + failed = 0 + + for i, file_path in enumerate(files_to_process, 1): + try: + # Calculate relative path and output path + rel_path = file_path.relative_to(input_dir) + output_file = output_dir / rel_path.with_suffix('.md') + output_file.parent.mkdir(parents=True, exist_ok=True) + + print(f"[{i}/{len(files_to_process)}] Processing: {rel_path}") + + # Convert file + result = markitdown.convert( + str(file_path), + stream_info=stream_info, + keep_data_uris=args.keep_data_uris + ) + + # Write output + with open(output_file, 'w', encoding='utf-8') as f: + f.write(result.markdown) + + print(f"✓ Success: {rel_path}") + processed += 1 + + except Exception as e: + print(f"✗ Failed: {rel_path} - {e}") + failed += 1 + + print(f"\nBatch processing complete!") + print(f"Success: {processed} files") + print(f"Failed: {failed} files") + print(f"Output directory: {output_dir}") + + if __name__ == "__main__": main() diff --git a/packages/markitdown/tests/test_cli_misc.py b/packages/markitdown/tests/test_cli_misc.py index cf6c9ccc..ea6e0791 100644 --- a/packages/markitdown/tests/test_cli_misc.py +++ b/packages/markitdown/tests/test_cli_misc.py @@ -8,7 +8,7 @@ def test_version() -> None: result = subprocess.run( - ["python", "-m", "markitdown", "--version"], capture_output=True, text=True + ["python3", "-m", "markitdown", "--version"], capture_output=True, text=True ) assert result.returncode == 0, f"CLI exited with error: {result.stderr}" @@ -17,7 +17,7 @@ def test_version() -> None: def test_invalid_flag() -> None: result = subprocess.run( - ["python", "-m", "markitdown", 
"--foobar"], capture_output=True, text=True + ["python3", "-m", "markitdown", "--foobar"], capture_output=True, text=True ) assert result.returncode != 0, f"CLI exited with error: {result.stderr}" @@ -27,8 +27,44 @@ def test_invalid_flag() -> None: assert "SYNTAX" in result.stderr, "Expected 'SYNTAX' to appear in STDERR" +def test_batch_help() -> None: + """Test that batch options are available in help""" + result = subprocess.run( + ["python3", "-m", "markitdown", "--help"], capture_output=True, text=True + ) + + assert result.returncode == 0, f"CLI exited with error: {result.stderr}" + assert "--batch" in result.stdout, "Expected --batch option in help" + assert "--recursive" in result.stdout, "Expected --recursive option in help" + assert "--types" in result.stdout, "Expected --types option in help" + + +def test_batch_missing_directory() -> None: + """Test that batch mode requires a directory""" + result = subprocess.run( + ["python3", "-m", "markitdown", "--batch"], capture_output=True, text=True + ) + + assert result.returncode != 0, f"CLI exited with error: {result.stderr}" + assert "Directory path is required" in result.stdout, "Expected directory requirement message" + + +def test_batch_nonexistent_directory() -> None: + """Test that batch mode handles nonexistent directory""" + result = subprocess.run( + ["python3", "-m", "markitdown", "--batch", "/nonexistent/directory"], + capture_output=True, text=True + ) + + assert result.returncode != 0, f"CLI exited with error: {result.stderr}" + assert "Directory does not exist" in result.stdout, "Expected directory existence check" + + if __name__ == "__main__": """Runs this file's tests from the command line.""" test_version() test_invalid_flag() + test_batch_help() + test_batch_missing_directory() + test_batch_nonexistent_directory() print("All tests passed!") From c695e9a8161bfe5a66faa6ea67590bba2a69b7c8 Mon Sep 17 00:00:00 2001 From: HossyWorlds Date: Sun, 20 Jul 2025 12:24:04 +0900 Subject: [PATCH 2/4] fix: 
python3 to python command --- packages/markitdown/tests/test_cli_misc.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/packages/markitdown/tests/test_cli_misc.py b/packages/markitdown/tests/test_cli_misc.py index ea6e0791..ca487f20 100644 --- a/packages/markitdown/tests/test_cli_misc.py +++ b/packages/markitdown/tests/test_cli_misc.py @@ -8,7 +8,7 @@ def test_version() -> None: result = subprocess.run( - ["python3", "-m", "markitdown", "--version"], capture_output=True, text=True + ["python", "-m", "markitdown", "--version"], capture_output=True, text=True ) assert result.returncode == 0, f"CLI exited with error: {result.stderr}" @@ -17,7 +17,7 @@ def test_version() -> None: def test_invalid_flag() -> None: result = subprocess.run( - ["python3", "-m", "markitdown", "--foobar"], capture_output=True, text=True + ["python", "-m", "markitdown", "--foobar"], capture_output=True, text=True ) assert result.returncode != 0, f"CLI exited with error: {result.stderr}" @@ -30,7 +30,7 @@ def test_invalid_flag() -> None: def test_batch_help() -> None: """Test that batch options are available in help""" result = subprocess.run( - ["python3", "-m", "markitdown", "--help"], capture_output=True, text=True + ["python", "-m", "markitdown", "--help"], capture_output=True, text=True ) assert result.returncode == 0, f"CLI exited with error: {result.stderr}" @@ -42,7 +42,7 @@ def test_batch_help() -> None: def test_batch_missing_directory() -> None: """Test that batch mode requires a directory""" result = subprocess.run( - ["python3", "-m", "markitdown", "--batch"], capture_output=True, text=True + ["python", "-m", "markitdown", "--batch"], capture_output=True, text=True ) assert result.returncode != 0, f"CLI exited with error: {result.stderr}" @@ -52,7 +52,7 @@ def test_batch_missing_directory() -> None: def test_batch_nonexistent_directory() -> None: """Test that batch mode handles nonexistent directory""" result = subprocess.run( - ["python3", "-m", 
"markitdown", "--batch", "/nonexistent/directory"], + ["python", "-m", "markitdown", "--batch", "/nonexistent/directory"], capture_output=True, text=True ) From 8b7f419acd52a5c723250e52e7bda39e8e9d5188 Mon Sep 17 00:00:00 2001 From: HossyWorlds Date: Sun, 20 Jul 2025 12:25:55 +0900 Subject: [PATCH 3/4] fix: implement batch processing with existing validation system --- .../markitdown/src/markitdown/__main__.py | 43 ++++++++----------- 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py index ff29fb10..3e8c2c2d 100644 --- a/packages/markitdown/src/markitdown/__main__.py +++ b/packages/markitdown/src/markitdown/__main__.py @@ -8,6 +8,7 @@ from importlib.metadata import entry_points from .__about__ import __version__ from ._markitdown import MarkItDown, StreamInfo, DocumentConverterResult +from ._exceptions import UnsupportedFormatException, FileConversionException def main(): @@ -245,8 +246,8 @@ def _exit_with_error(message: str): def _handle_batch_processing(args, markitdown: MarkItDown, stream_info): """Handle batch processing of files in a directory""" - import os from pathlib import Path + from ._exceptions import UnsupportedFormatException, FileConversionException input_dir = Path(args.filename) if not input_dir.exists(): @@ -258,46 +259,33 @@ def _handle_batch_processing(args, markitdown: MarkItDown, stream_info): output_dir = Path(args.output) if args.output else input_dir / "converted" output_dir.mkdir(parents=True, exist_ok=True) - # Get supported file types - supported_extensions = { - 'pdf', 'docx', 'pptx', 'xlsx', 'xls', 'csv', 'txt', 'html', 'htm', - 'json', 'xml', 'rss', 'msg', 'zip', 'epub', 'jpg', 'jpeg', 'png', - 'gif', 'bmp', 'tiff', 'wav', 'mp3', 'm4a', 'mp4' - } - - # Parse user-specified types - if args.types: - user_types = {ext.strip().lower().lstrip('.') for ext in args.types.split(',')} - supported_extensions = 
supported_extensions.intersection(user_types) - - # Find files to process + # Find all files to process pattern = "**/*" if args.recursive else "*" - files_to_process = [] + all_files = [] for file_path in input_dir.glob(pattern): if file_path.is_file(): - extension = file_path.suffix.lower().lstrip('.') - if extension in supported_extensions: - files_to_process.append(file_path) + all_files.append(file_path) - if not files_to_process: - print(f"No supported files found in {input_dir}") + if not all_files: + print(f"No files found in {input_dir}") return - print(f"Found {len(files_to_process)} files to process") + print(f"Found {len(all_files)} files to process") # Process files processed = 0 failed = 0 + unsupported = 0 - for i, file_path in enumerate(files_to_process, 1): + for i, file_path in enumerate(all_files, 1): try: # Calculate relative path and output path rel_path = file_path.relative_to(input_dir) output_file = output_dir / rel_path.with_suffix('.md') output_file.parent.mkdir(parents=True, exist_ok=True) - print(f"[{i}/{len(files_to_process)}] Processing: {rel_path}") + print(f"[{i}/{len(all_files)}] Processing: {rel_path}") # Convert file result = markitdown.convert( @@ -313,13 +301,20 @@ def _handle_batch_processing(args, markitdown: MarkItDown, stream_info): print(f"✓ Success: {rel_path}") processed += 1 + except UnsupportedFormatException: + print(f"⚠ Skipped (unsupported): {rel_path}") + unsupported += 1 + except FileConversionException as e: + print(f"✗ Failed (conversion error): {rel_path} - {e}") + failed += 1 except Exception as e: - print(f"✗ Failed: {rel_path} - {e}") + print(f"✗ Failed (unexpected error): {rel_path} - {e}") failed += 1 print(f"\nBatch processing complete!") print(f"Success: {processed} files") print(f"Failed: {failed} files") + print(f"Unsupported: {unsupported} files") print(f"Output directory: {output_dir}") From 6150983e377f85d2f16f8aaaa7c62147a0555e54 Mon Sep 17 00:00:00 2001 From: HossyWorlds Date: Fri, 1 Aug 2025 
18:42:02 +0900 Subject: [PATCH 4/4] fix: prevent file overwriting in batch processing mode Changed from using with_suffix('.md') to appending '.md' to preserve original filenames. This prevents files with the same base name but different extensions (e.g., test.txt, test.py, test.md) from overwriting each other in the output directory. Fixes an issue where batch processing would overwrite files, causing data loss. --- packages/markitdown/src/markitdown/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py index 3e8c2c2d..5ee402a8 100644 --- a/packages/markitdown/src/markitdown/__main__.py +++ b/packages/markitdown/src/markitdown/__main__.py @@ -282,7 +282,7 @@ def _handle_batch_processing(args, markitdown: MarkItDown, stream_info): try: # Calculate relative path and output path rel_path = file_path.relative_to(input_dir) - output_file = output_dir / rel_path.with_suffix('.md') + output_file = output_dir / Path(str(rel_path) + '.md') output_file.parent.mkdir(parents=True, exist_ok=True) print(f"[{i}/{len(all_files)}] Processing: {rel_path}")