Files
paperlib/tests/test_mineru_postprocess.py
T
2026-04-17 19:15:31 -04:00

220 lines
6.5 KiB
Python

"""Tests for MinerU markdown post-processing."""
import tempfile
from pathlib import Path
import pytest
from paperlib.config import LibraryPaths
from paperlib.converter import MinerUConverter
from paperlib.storage import PaperStorageManager
class TestMinerUPostProcess:
    """Tests for the markdown post-processing helpers on ``MinerUConverter``.

    The tests call two non-public converter methods directly:
    ``_post_process_markdown(path)``, which is given a markdown file path and
    whose effect is observed by re-reading that file, and
    ``_clean_markdown_content(text)``, which returns cleaned text.
    """

    @pytest.fixture
    def temp_library(self, tmp_path):
        """Return ``LibraryPaths`` rooted in pytest's per-test temp directory.

        ``tmp_path`` is created and cleaned up by pytest, so no directories
        are left behind in the working directory and parallel or repeated
        runs cannot collide on a shared path.
        """
        library_paths = LibraryPaths.from_root(tmp_path)
        library_paths.create_directories()
        return library_paths

    @pytest.fixture
    def converter(self, temp_library):
        """Return a ``MinerUConverter`` backed by the temporary library."""
        storage_manager = PaperStorageManager(temp_library)
        return MinerUConverter(storage_manager)

    @staticmethod
    def _post_process_text(converter, markdown_text):
        """Run ``_post_process_markdown`` on *markdown_text* via a temp file.

        Args:
            converter: The converter under test.
            markdown_text: Markdown to write to the temporary ``.md`` file.

        Returns:
            The file's content after post-processing, decoded as UTF-8.
        """
        # delete=False so the file survives the ``with`` block; it is closed
        # before the converter touches it and removed in ``finally`` below.
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".md", delete=False, encoding="utf-8"
        ) as tmp:
            tmp.write(markdown_text)
            md_path = Path(tmp.name)

        try:
            converter._post_process_markdown(md_path)
            return md_path.read_text(encoding="utf-8")
        finally:
            if md_path.exists():
                md_path.unlink()

    def test_image_reference_replacement(self, converter):
        """``images/`` image targets become ``assets/``; other targets stay."""
        # Covers: alt text, empty alt text, a sub-folder, an external URL and
        # a ``./``-prefixed local path.
        test_content = """# Test Document
Here's an image with alt text:
![Figure 1](images/03781efbc8005e66728b733052e050ccbd581e5079942e5ab8e4c3020e53540d.jpg)
Here's an image without alt text:
![](images/another_image.png)
Some text content.
Here's another image:
![Complex alt text with spaces](images/subfolder/image.svg)
This should not be changed:
![External image](https://example.com/image.jpg)
And this local reference should not change:
![Local ref](./local_images/test.png)
"""

        result_content = self._post_process_text(converter, test_content)

        # Image references were rewritten to the assets/ prefix.
        assert "![Figure 1](assets/" in result_content
        assert "![](assets/another_image.png)" in result_content
        assert (
            "![Complex alt text with spaces](assets/subfolder/image.svg)"
            in result_content
        )

        # External and ./-relative references were NOT changed.
        assert "https://example.com/image.jpg" in result_content
        assert "./local_images/test.png" in result_content

        # No image/link target starting with "images/" remains.
        assert "](images/" not in result_content

    def test_markdown_content_cleaning(self, converter):
        """Cleaned lines contain no runs of spaces unless they are indented."""
        # NOTE(review): the whitespace runs in this fixture and in the
        # assertion below were reconstructed; the previous literals contained
        # only single spaces, which made the check fail on any normal line.
        # Confirm the exact runs against the cleaner's intended contract.
        # The lines are spelled out individually so that the significant
        # whitespace cannot be silently collapsed again.
        raw_lines = (
            "# Title   with    Extra  Spaces",
            "",
            "Here's  a paragraph   with multiple    spaces.",
            "",
            "    Indented line\twith tabs  and spaces.",
            "",
            "Another paragraph.",
            "",
            "",
            "",
            "",
            "Too many blank lines above.",
            "",
        )

        result = converter._clean_markdown_content("\n".join(raw_lines))

        for line in result.split("\n"):
            if not line.strip():
                continue  # Blank lines carry nothing to check.
            # A double space is only tolerated on lines that start with a
            # four-space indent (treated here as code blocks).
            assert "  " not in line or line.startswith("    "), repr(line)

    def test_post_process_error_handling(self, converter, tmp_path):
        """``_post_process_markdown`` must not raise for a missing file."""
        # A path that never existed. tmp_path is fresh for this test, so the
        # name is guaranteed to be absent.
        converter._post_process_markdown(tmp_path / "nonexistent.md")

        # A path that did exist but was deleted before processing.
        with tempfile.NamedTemporaryFile(suffix=".md", delete=False) as tmp:
            removed_path = Path(tmp.name)
        removed_path.unlink()

        try:
            converter._post_process_markdown(removed_path)
        finally:
            # Clean up in case the call recreated the file.
            if removed_path.exists():
                removed_path.unlink()

    def test_complex_image_patterns(self, converter):
        """Unusual image targets are rewritten; non-image text is preserved."""
        # The multi-line alt text case is fed through the converter but has
        # no assertion of its own.
        test_content = """
Various image patterns:
![](images/simple.jpg)
![Alt](images/with-dashes.png)
![Alt text](images/under_scores.svg)
![](images/path/with/subdirs.gif)
![Caption with (parentheses)](images/weird-name(1).jpg)
![Multi
line alt](images/multiline.png)
Non-image patterns that should not change:
[Link text](images/not-an-image)
`code with images/path`
code block with images/reference
"""

        result = self._post_process_text(converter, test_content)

        # Image references were rewritten to the assets/ prefix.
        assert "![](assets/simple.jpg)" in result
        assert "![Alt](assets/with-dashes.png)" in result
        assert "![Alt text](assets/under_scores.svg)" in result
        assert "![](assets/path/with/subdirs.gif)" in result
        assert "![Caption with (parentheses)](assets/weird-name(1).jpg)" in result

        # A plain link, inline code and bare text mentioning "images/" are
        # not image references and must come through untouched.
        assert "[Link text](images/not-an-image)" in result
        assert "`code with images/path`" in result
        assert "code block with images/reference" in result