"""Tests for MinerU markdown post-processing."""

import tempfile
from pathlib import Path

import pytest

from paperlib.config import LibraryPaths
from paperlib.converter import MinerUConverter
from paperlib.storage import PaperStorageManager


class TestMinerUPostProcess:
    """Test MinerU markdown post-processing functionality.

    All files are created below pytest's per-test ``tmp_path`` directory so
    that nothing is left behind in the working directory after a run.
    """

    @pytest.fixture
    def temp_library(self, tmp_path: Path) -> LibraryPaths:
        """Create a temporary library for testing.

        The library root lives in its own sub-directory of ``tmp_path`` so it
        cannot collide with markdown files the tests write next to it.
        """
        library_root = tmp_path / "library"
        library_root.mkdir(parents=True, exist_ok=True)

        library_paths = LibraryPaths.from_root(library_root)
        library_paths.create_directories()
        return library_paths

    @pytest.fixture
    def converter(self, temp_library: LibraryPaths) -> MinerUConverter:
        """Create a MinerUConverter backed by the temporary library."""
        storage_manager = PaperStorageManager(temp_library)
        return MinerUConverter(storage_manager)

    @staticmethod
    def _run_post_process(
        converter: MinerUConverter, md_path: Path, content: str
    ) -> str:
        """Write *content* to *md_path*, post-process it, and return the result.

        The converter is expected to rewrite the file in place, so the
        processed text is read back from the same path.
        """
        md_path.write_text(content, encoding="utf-8")
        converter._post_process_markdown(md_path)
        return md_path.read_text(encoding="utf-8")

    def test_image_reference_replacement(
        self, converter: MinerUConverter, tmp_path: Path
    ) -> None:
        """Test that image references are correctly updated."""
        # Markdown content with various image reference formats.
        test_content = """# Test Document

Here's an image with alt text:
![Figure 1](images/03781efbc8005e66728b733052e050ccbd581e5079942e5ab8e4c3020e53540d.jpg)

Here's an image without alt text:
![](images/another_image.png)

Some text content.

Here's another image:
![Complex alt text with spaces](images/subfolder/image.svg)

This should not be changed:
![External image](https://example.com/image.jpg)

And this local reference should not change:
![Local ref](./local_images/test.png)
"""

        result_content = self._run_post_process(
            converter, tmp_path / "image_refs.md", test_content
        )

        # Verify image references were updated correctly
        assert "![Figure 1](assets/" in result_content
        assert "![](assets/another_image.png)" in result_content
        assert (
            "![Complex alt text with spaces](assets/subfolder/image.svg)"
            in result_content
        )

        # Verify external and local references were NOT changed
        assert "https://example.com/image.jpg" in result_content
        assert "./local_images/test.png" in result_content

        # Verify no "images/" references remain
        assert "](images/" not in result_content

    def test_markdown_content_cleaning(self, converter: MinerUConverter) -> None:
        """Test markdown content cleaning functionality."""
        # Whitespace is spelled out line by line so that runs of spaces, tabs
        # and blank lines stay visible and survive editors/formatters.
        # NOTE(review): the exact whitespace of this fixture was reconstructed
        # from the test's intent - confirm against the cleaner's contract.
        test_content = (
            "# Title   with   Extra   Spaces\n"
            "\n"
            "\n"
            "Here's a paragraph with    multiple   spaces.\n"
            "\n"
            "    Indented line with\ttabs and spaces.\n"
            "\n"
            "\n"
            "Another paragraph.\n"
            "\n"
            "\n"
            "\n"
            "\n"
            "Too many blank lines above.\n"
        )

        result = converter._clean_markdown_content(test_content)

        # Check that excessive whitespace within lines is cleaned
        for line in result.split("\n"):
            if not line.strip():
                continue  # Blank lines carry no in-line whitespace to check
            # Should not have multiple consecutive spaces, except for
            # indented code blocks (four leading spaces).
            assert "  " not in line or line.startswith("    ")

    def test_post_process_error_handling(
        self, converter: MinerUConverter, tmp_path: Path
    ) -> None:
        """Test that post-processing errors don't crash conversion."""
        # A path that never existed: must not raise.
        converter._post_process_markdown(tmp_path / "nonexistent.md")

        # A file that existed and then disappeared before processing
        # (stand-in for an unreadable file): must be handled gracefully.
        with tempfile.NamedTemporaryFile(
            suffix=".md", dir=tmp_path, delete=False
        ) as tmp:
            vanished_path = Path(tmp.name)
        vanished_path.unlink()

        converter._post_process_markdown(vanished_path)

    def test_complex_image_patterns(
        self, converter: MinerUConverter, tmp_path: Path
    ) -> None:
        """Test complex image reference patterns."""
        test_content = """
Various image patterns:
![](images/simple.jpg)
![Alt](images/with-dashes.png)
![Alt text](images/under_scores.svg)
![](images/path/with/subdirs.gif)
![Caption with (parentheses)](images/weird-name(1).jpg)
![Multi line alt](images/multiline.png)

Non-image patterns that should not change:
[Link text](images/not-an-image)
`code with images/path`
    code block with images/reference
"""

        result = self._run_post_process(
            converter, tmp_path / "complex_patterns.md", test_content
        )

        # Verify all image references were updated
        assert "![](assets/simple.jpg)" in result
        assert "![Alt](assets/with-dashes.png)" in result
        assert "![Alt text](assets/under_scores.svg)" in result
        assert "![](assets/path/with/subdirs.gif)" in result
        assert "![Caption with (parentheses)](assets/weird-name(1).jpg)" in result

        # Verify non-image patterns were preserved
        assert "[Link text](images/not-an-image)" in result
        assert "`code with images/path`" in result
        # Leading spaces may be removed by cleaning, so match the text only
        assert "code block with images/reference" in result