From 9006bf36b8d7cb7c8b146b462d1080df2f4c3bed Mon Sep 17 00:00:00 2001 From: Yingjie Wang Date: Fri, 17 Apr 2026 17:37:17 -0400 Subject: [PATCH] fix: fix the mineru call --- src/paperlib/cli.py | 23 +- src/paperlib/converter/mineru_converter.py | 100 ++++++--- tests/test_converter.py | 233 +++++++++++++++++++++ 3 files changed, 327 insertions(+), 29 deletions(-) create mode 100644 tests/test_converter.py diff --git a/src/paperlib/cli.py b/src/paperlib/cli.py index e601738..93c517b 100644 --- a/src/paperlib/cli.py +++ b/src/paperlib/cli.py @@ -94,6 +94,12 @@ def _build_parser() -> argparse.ArgumentParser: ) convert_parser.add_argument("--library", "-L", default=".", help="Library root") convert_parser.add_argument("--paper-id", help="Convert specific paper by ID") + convert_parser.add_argument( + "--retry-failed", action="store_true", help="Retry failed conversions" + ) + convert_parser.add_argument( + "--force", action="store_true", help="Force reconvert successful papers" + ) convert_parser.set_defaults(handler=_handle_convert) # Reindex command @@ -322,9 +328,20 @@ def _handle_convert(args: argparse.Namespace) -> int: print(f"Paper not found: {args.paper_id}") return 1 else: - # Convert all pending papers - success_count, failure_count = converter.convert_all_pending() - msg = f"Complete: {success_count} successful, {failure_count} failed" + # Convert papers based on flags + success_count, failure_count = converter.convert_all_pending( + retry_failed=args.retry_failed, force=args.force + ) + + # Show what was attempted + if args.force: + action = "Force converted" + elif args.retry_failed: + action = "Converted pending and retried failed" + else: + action = "Converted pending" + + msg = f"{action}: {success_count} successful, {failure_count} failed" print(msg) return 0 if failure_count == 0 else 1 diff --git a/src/paperlib/converter/mineru_converter.py b/src/paperlib/converter/mineru_converter.py index 9836e81..4a02f58 100644 --- a/src/paperlib/converter/mineru_converter.py +++ b/src/paperlib/converter/mineru_converter.py @@ -18,16 +18,26 @@ class MinerUConverter: self.logger = logging.getLogger(__name__) def is_mineru_available(self) -> bool: - """Check if MinerU is available in the environment.""" + """Check if MinerU CLI is available in the environment.""" try: + # Check if mineru command is available result = subprocess.run( - [sys.executable, "-c", "import mineru"], + ["mineru", "--version"], capture_output=True, check=False, ) return result.returncode == 0 except (subprocess.SubprocessError, FileNotFoundError): - return False + # Fallback: check if mineru module is importable + try: + result = subprocess.run( + [sys.executable, "-c", "import mineru"], + capture_output=True, + check=False, + ) + return result.returncode == 0 + except (subprocess.SubprocessError, FileNotFoundError): + return False def convert_paper(self, metadata: PaperMetadata) -> bool: """Convert a paper's PDF to Markdown using MinerU.""" @@ -52,18 +62,23 @@ class MinerUConverter: self.storage_manager.update_paper_metadata(metadata) try: + # Create temporary output directory in cache + cache_dir = self.storage_manager.library_paths.cache_dir + temp_output_dir = cache_dir / f"mineru_temp_{metadata.paper_id}" + temp_output_dir.mkdir(exist_ok=True) + # Run MinerU conversion log_file = logs_dir / "mineru.log" - # MinerU command + # Correct MinerU command cmd = [ - sys.executable, - "-m", - "magic_pdf.pipe.UNIPipe", - "--pdf", + "mineru", + "-p", str(pdf_path), - "--output-dir", - str(paths["directory"]), + "-o", + str(temp_output_dir), + "-b", + "pipeline", # CPU-only mode for compatibility ] self.logger.info(f"Running MinerU conversion: {' '.join(cmd)}") @@ -73,24 +88,30 @@ class MinerUConverter: cmd, stdout=log, stderr=subprocess.STDOUT, - cwd=paths["directory"], check=False, ) # Check if conversion was successful if result.returncode == 0: - # MinerU typically outputs markdown files, try to find the main one - # Look for common output patterns - markdown_candidates = list(paths["directory"].glob("*.md")) - if not markdown_candidates: - # Try subdirectories - markdown_candidates = list(paths["directory"].rglob("*.md")) + # MinerU outputs to // + pdf_stem = pdf_path.stem # Get filename without .pdf extension + mineru_output_dir = temp_output_dir / pdf_stem + expected_markdown = mineru_output_dir / f"{pdf_stem}.md" + expected_images = mineru_output_dir / "images" - if markdown_candidates: - # Use the first markdown file found, or rename if needed - main_md = markdown_candidates[0] - if main_md != markdown_path: - main_md.rename(markdown_path) + if expected_markdown.exists(): + # Move markdown file to paper directory + expected_markdown.rename(markdown_path) + + # Move images directory if it exists + if expected_images.exists(): + assets_target = paths["assets"] + if assets_target.exists(): + # Remove existing assets directory + import shutil + + shutil.rmtree(assets_target) + expected_images.rename(assets_target) # Update metadata metadata.conversion_status = ConversionStatus.SUCCESS @@ -99,9 +120,16 @@ class MinerUConverter: self.logger.info( f"Successfully converted {pdf_path} to {markdown_path}" ) + + # Clean up temporary directory + import shutil + + shutil.rmtree(temp_output_dir) return True else: - self.logger.error("No markdown output found after conversion") + self.logger.error( + f"Expected markdown file not found: {expected_markdown}" + ) metadata.conversion_status = ConversionStatus.FAILED self.storage_manager.update_paper_metadata(metadata) return False @@ -118,14 +146,34 @@ class MinerUConverter: metadata.conversion_status = ConversionStatus.FAILED self.storage_manager.update_paper_metadata(metadata) return False + finally: + # Ensure cleanup of temp directory + if "temp_output_dir" in locals() and temp_output_dir.exists(): + import shutil - def convert_all_pending(self) -> tuple[int, int]: - """Convert all papers with pending conversion status.""" + shutil.rmtree(temp_output_dir, ignore_errors=True) + + def convert_all_pending( + self, retry_failed: bool = False, force: bool = False + ) -> tuple[int, int]: + """Convert papers based on their conversion status.""" success_count = 0 failure_count = 0 for metadata in self.storage_manager.list_all_papers(): - if metadata.conversion_status == ConversionStatus.PENDING: + should_convert = False + + if force: + # Force convert all papers + should_convert = True + elif metadata.conversion_status == ConversionStatus.PENDING: + # Convert pending papers + should_convert = True + elif retry_failed and metadata.conversion_status == ConversionStatus.FAILED: + # Retry failed conversions if requested + should_convert = True + + if should_convert: if self.convert_paper(metadata): success_count += 1 else: diff --git a/tests/test_converter.py b/tests/test_converter.py new file mode 100644 index 0000000..2cadc2c --- /dev/null +++ b/tests/test_converter.py @@ -0,0 +1,233 @@ +"""Tests for paperlib PDF converter.""" + +import shutil +from pathlib import Path +from unittest.mock import Mock, patch + +import pytest + +from paperlib.config import LibraryPaths +from paperlib.converter import MinerUConverter +from paperlib.models import ConversionStatus, PaperMetadata, SourceType +from paperlib.storage import PaperStorageManager + + +class TestMinerUConverter: + """Test MinerUConverter functionality.""" + + @pytest.fixture + def temp_library(self): + """Create a temporary library for testing.""" + temp_dir = Path("./.tmp") / f"test_converter_{hash(self)}" + temp_dir.mkdir(parents=True, exist_ok=True) + library_paths = LibraryPaths.from_root(temp_dir) + library_paths.create_directories() + + yield library_paths + + # Cleanup + if temp_dir.exists(): + shutil.rmtree(temp_dir) + + @pytest.fixture + def storage_manager(self, temp_library): + """Create a storage manager for testing.""" + return PaperStorageManager(temp_library) + + @pytest.fixture + def converter(self, storage_manager): + """Create a MinerUConverter for testing.""" + return MinerUConverter(storage_manager) + + @pytest.fixture + def sample_metadata(self, storage_manager): + """Create sample paper metadata for testing.""" + # Create a sample PDF file + pdf_file = Path("./.tmp") / f"test_convert_{hash(self)}.pdf" + with pdf_file.open("wb") as f: + f.write(b"%PDF-1.4\n") + f.write(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n") + f.write(b"%%EOF\n") + + # Store the paper + metadata = storage_manager.store_paper( + pdf_path=pdf_file, + source_type=SourceType.LOCAL, + title="Test Paper for Conversion", + ) + + return metadata + + @patch("subprocess.run") + def test_is_mineru_available_cli(self, mock_run, converter): + """Test MinerU availability check using CLI.""" + # Mock successful mineru --version command + mock_run.return_value.returncode = 0 + + assert converter.is_mineru_available() is True + mock_run.assert_called_with( + ["mineru", "--version"], + capture_output=True, + check=False, + ) + + @patch("subprocess.run") + def test_is_mineru_available_fallback(self, mock_run, converter): + """Test MinerU availability fallback to import check.""" + # Mock mineru command not found, but module available + mock_run.side_effect = [ + Mock(returncode=1), # mineru --version fails + Mock(returncode=0), # import mineru succeeds + ] + + assert converter.is_mineru_available() is True + assert mock_run.call_count == 2 + + @patch("subprocess.run") + def test_is_mineru_unavailable(self, mock_run, converter): + """Test when MinerU is completely unavailable.""" + # Mock both command and import failing + mock_run.side_effect = [ + Mock(returncode=1), # mineru --version fails + Mock(returncode=1), # import mineru fails + ] + + assert converter.is_mineru_available() is False + + @patch("subprocess.run") + def test_convert_paper_success(self, mock_run, converter, sample_metadata): + """Test successful paper conversion.""" + # Mock successful mineru command + mock_run.return_value.returncode = 0 + + # Create expected output structure in temp cache + cache_dir = converter.storage_manager.library_paths.cache_dir + temp_output_dir = cache_dir / f"mineru_temp_{sample_metadata.paper_id}" + pdf_stem = "test_convert_" + str(hash(sample_metadata)) + mineru_output_dir = temp_output_dir / pdf_stem + mineru_output_dir.mkdir(parents=True, exist_ok=True) + + # Create expected output files + markdown_file = mineru_output_dir / f"{pdf_stem}.md" + images_dir = mineru_output_dir / "images" + + markdown_file.write_text( + "# Test Markdown Content\n\nThis is converted content." + ) + images_dir.mkdir(exist_ok=True) + (images_dir / "figure1.png").write_bytes(b"fake image data") + + try: + # Run conversion + result = converter.convert_paper(sample_metadata) + + # Verify command was called correctly + expected_cmd = [ + "mineru", + "-p", + mock_run.call_args[0][0][2], # PDF path + "-o", + mock_run.call_args[0][0][4], # Output dir + "-b", + "pipeline", + ] + + # Check that mineru was called with correct arguments + actual_cmd = mock_run.call_args[0][0] + assert actual_cmd[0] == "mineru" + assert "-p" in actual_cmd + assert "-o" in actual_cmd + assert "-b" in actual_cmd + assert "pipeline" in actual_cmd + + # Verify conversion was successful + assert result is True + + # Reload metadata and check status + updated_metadata = converter.storage_manager.load_paper_metadata( + sample_metadata.paper_id, sample_metadata.source_type + ) + assert updated_metadata.conversion_status == ConversionStatus.SUCCESS + + finally: + # Cleanup + if temp_output_dir.exists(): + shutil.rmtree(temp_output_dir, ignore_errors=True) + + @patch("subprocess.run") + def test_convert_paper_command_failure(self, mock_run, converter, sample_metadata): + """Test conversion when mineru command fails.""" + # Mock failed mineru command + mock_run.return_value.returncode = 1 + + result = converter.convert_paper(sample_metadata) + + # Verify conversion failed + assert result is False + + # Check metadata was updated with failure status + updated_metadata = converter.storage_manager.load_paper_metadata( + sample_metadata.paper_id, sample_metadata.source_type + ) + assert updated_metadata.conversion_status == ConversionStatus.FAILED + + def test_convert_paper_mineru_unavailable(self, converter, sample_metadata): + """Test conversion when MinerU is not available.""" + # Mock MinerU as unavailable + with patch.object(converter, "is_mineru_available", return_value=False): + result = converter.convert_paper(sample_metadata) + + assert result is False + + def test_convert_paper_missing_pdf(self, converter, storage_manager): + """Test conversion when PDF file is missing.""" + # Create metadata pointing to non-existent PDF + metadata = PaperMetadata( + paper_id="missing-pdf-test", + source_type=SourceType.LOCAL, + title="Missing PDF Test", + pdf_path="nonexistent/path.pdf", + conversion_status=ConversionStatus.PENDING, + ) + + result = converter.convert_paper(metadata) + assert result is False + + def test_convert_all_pending(self, converter, storage_manager): + """Test converting all papers with pending status.""" + # Create sample PDF + pdf_file = Path("./.tmp") / f"batch_test_{hash(self)}.pdf" + with pdf_file.open("wb") as f: + f.write(b"%PDF-1.4\n%%EOF\n") + + try: + # Store multiple papers + papers = [] + for i in range(3): + unique_pdf = Path("./.tmp") / f"batch_{i}_{hash(self)}.pdf" + shutil.copy2(pdf_file, unique_pdf) + + try: + metadata = storage_manager.store_paper( + pdf_path=unique_pdf, + source_type=SourceType.LOCAL, + title=f"Batch Paper {i}", + ) + papers.append(metadata) + finally: + if unique_pdf.exists(): + unique_pdf.unlink() + + # Mock conversions: 2 succeed, 1 fails + with patch.object(converter, "convert_paper") as mock_convert: + mock_convert.side_effect = [True, False, True] + + success_count, failure_count = converter.convert_all_pending() + + assert success_count == 2 + assert failure_count == 1 + assert mock_convert.call_count == 3 + + finally: + if pdf_file.exists(): + pdf_file.unlink()