"""Tests for paperlib PDF converter."""

import shutil
from pathlib import Path
from unittest.mock import Mock, patch

import pytest

from paperlib.config import LibraryPaths
from paperlib.converter import MinerUConverter
from paperlib.models import ConversionStatus, PaperMetadata, SourceType
from paperlib.storage import PaperStorageManager


class TestMinerUConverter:
    """Test MinerUConverter functionality."""

    @pytest.fixture
    def temp_library(self):
        """Create a temporary library for testing.

        Yields the ``LibraryPaths`` built from a scratch directory under
        ``./.tmp`` and removes that directory on teardown.
        """
        temp_dir = Path("./.tmp") / f"test_converter_{hash(self)}"
        temp_dir.mkdir(parents=True, exist_ok=True)

        try:
            library_paths = LibraryPaths.from_root(temp_dir)
            library_paths.create_directories()
            yield library_paths
        finally:
            # Runs on teardown and also when setup fails after the mkdir,
            # so no scratch directory is left behind.
            if temp_dir.exists():
                shutil.rmtree(temp_dir)

    @pytest.fixture
    def storage_manager(self, temp_library):
        """Create a storage manager for testing."""
        return PaperStorageManager(temp_library)

    @pytest.fixture
    def converter(self, storage_manager):
        """Create a MinerUConverter for testing."""
        return MinerUConverter(storage_manager)

    @pytest.fixture
    def sample_metadata(self, storage_manager):
        """Create sample paper metadata for testing.

        Writes a small PDF-like file under ``./.tmp``, stores it through
        ``storage_manager.store_paper`` and yields the returned metadata.
        The scratch file is deleted on teardown.
        """
        pdf_file = Path("./.tmp") / f"test_convert_{hash(self)}.pdf"

        try:
            # Create a sample PDF file
            with pdf_file.open("wb") as f:
                f.write(b"%PDF-1.4\n")
                f.write(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n")
                f.write(b"%%EOF\n")

            # Store the paper
            yield storage_manager.store_paper(
                pdf_path=pdf_file,
                source_type=SourceType.LOCAL,
                title="Test Paper for Conversion",
            )
        finally:
            # Same cleanup pattern as test_convert_all_pending: the source
            # file may or may not still exist after store_paper.
            if pdf_file.exists():
                pdf_file.unlink()

    @patch("subprocess.run")
    def test_is_mineru_available_cli(self, mock_run, converter):
        """Test MinerU availability check using CLI."""
        # Mock successful mineru --version command
        mock_run.return_value.returncode = 0

        assert converter.is_mineru_available() is True
        mock_run.assert_called_with(
            ["mineru", "--version"],
            capture_output=True,
            check=False,
        )

    @patch("subprocess.run")
    def test_is_mineru_available_fallback(self, mock_run, converter):
        """Test MinerU availability fallback to import check."""
        # Mock mineru command not found, but module available
        mock_run.side_effect = [
            Mock(returncode=1),  # mineru --version fails
            Mock(returncode=0),  # import mineru succeeds
        ]

        assert converter.is_mineru_available() is True
        assert mock_run.call_count == 2

    @patch("subprocess.run")
    def test_is_mineru_unavailable(self, mock_run, converter):
        """Test when MinerU is completely unavailable."""
        # Mock both command and import failing
        mock_run.side_effect = [
            Mock(returncode=1),  # mineru --version fails
            Mock(returncode=1),  # import mineru fails
        ]

        assert converter.is_mineru_available() is False

    @patch("subprocess.run")
    def test_convert_paper_success(self, mock_run, converter, sample_metadata):
        """Test successful paper conversion."""
        # Mock successful mineru command
        mock_run.return_value.returncode = 0

        # Create expected output structure in temp cache
        cache_dir = converter.storage_manager.library_paths.cache_dir
        temp_output_dir = cache_dir / f"mineru_temp_{sample_metadata.paper_id}"
        # NOTE(review): the sample_metadata fixture names its PDF with
        # hash(self), while this stem uses hash(sample_metadata) - confirm
        # which stem the converter actually looks for in the output dir.
        pdf_stem = "test_convert_" + str(hash(sample_metadata))
        mineru_output_dir = temp_output_dir / pdf_stem
        mineru_output_dir.mkdir(parents=True, exist_ok=True)

        # Create expected output files
        markdown_file = mineru_output_dir / f"{pdf_stem}.md"
        images_dir = mineru_output_dir / "images"

        markdown_file.write_text(
            "# Test Markdown Content\n\nThis is converted content."
        )
        images_dir.mkdir(exist_ok=True)
        (images_dir / "figure1.png").write_bytes(b"fake image data")

        try:
            # Run conversion
            result = converter.convert_paper(sample_metadata)

            # Check that mineru was called with correct arguments; the PDF
            # and output paths are chosen by the converter, so only the
            # executable and the flags are checked.
            actual_cmd = mock_run.call_args[0][0]
            assert actual_cmd[0] == "mineru"
            assert "-p" in actual_cmd
            assert "-o" in actual_cmd
            assert "-b" in actual_cmd
            assert "pipeline" in actual_cmd

            # Verify conversion was successful
            assert result is True

            # Reload metadata and check status
            updated_metadata = converter.storage_manager.load_paper_metadata(
                sample_metadata.paper_id, sample_metadata.source_type
            )
            assert updated_metadata.conversion_status == ConversionStatus.SUCCESS
        finally:
            # Cleanup
            if temp_output_dir.exists():
                shutil.rmtree(temp_output_dir, ignore_errors=True)

    @patch("subprocess.run")
    def test_convert_paper_command_failure(self, mock_run, converter, sample_metadata):
        """Test conversion when mineru command fails."""
        # Mock failed mineru command
        mock_run.return_value.returncode = 1

        result = converter.convert_paper(sample_metadata)

        # Verify conversion failed
        assert result is False

        # Check metadata was updated with failure status
        updated_metadata = converter.storage_manager.load_paper_metadata(
            sample_metadata.paper_id, sample_metadata.source_type
        )
        assert updated_metadata.conversion_status == ConversionStatus.FAILED

    def test_convert_paper_mineru_unavailable(self, converter, sample_metadata):
        """Test conversion when MinerU is not available."""
        # Mock MinerU as unavailable
        with patch.object(converter, "is_mineru_available", return_value=False):
            result = converter.convert_paper(sample_metadata)
            assert result is False

    def test_convert_paper_missing_pdf(self, converter, storage_manager):
        """Test conversion when PDF file is missing."""
        # Create metadata pointing to non-existent PDF
        metadata = PaperMetadata(
            paper_id="missing-pdf-test",
            source_type=SourceType.LOCAL,
            title="Missing PDF Test",
            pdf_path="nonexistent/path.pdf",
            conversion_status=ConversionStatus.PENDING,
        )

        result = converter.convert_paper(metadata)
        assert result is False

    def test_convert_all_pending(self, converter, storage_manager):
        """Test converting all papers with pending status."""
        # Create sample PDF
        pdf_file = Path("./.tmp") / f"batch_test_{hash(self)}.pdf"
        with pdf_file.open("wb") as f:
            f.write(b"%PDF-1.4\n%%EOF\n")

        try:
            # Store multiple papers, each from its own copy of the sample
            # PDF; the copy is removed as soon as it has been stored.
            for i in range(3):
                unique_pdf = Path("./.tmp") / f"batch_{i}_{hash(self)}.pdf"
                shutil.copy2(pdf_file, unique_pdf)

                try:
                    storage_manager.store_paper(
                        pdf_path=unique_pdf,
                        source_type=SourceType.LOCAL,
                        title=f"Batch Paper {i}",
                    )
                finally:
                    if unique_pdf.exists():
                        unique_pdf.unlink()

            # Mock conversions: 2 succeed, 1 fails
            with patch.object(converter, "convert_paper") as mock_convert:
                mock_convert.side_effect = [True, False, True]

                success_count, failure_count = converter.convert_all_pending()

                assert success_count == 2
                assert failure_count == 1
                assert mock_convert.call_count == 3
        finally:
            if pdf_file.exists():
                pdf_file.unlink()