"""Tests for MinerU markdown post-processing."""
|
|
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from paperlib.config import LibraryPaths
|
|
from paperlib.converter import MinerUConverter
|
|
from paperlib.storage import PaperStorageManager
|
|
|
|
|
|
class TestMinerUPostProcess:
    """Test MinerU markdown post-processing functionality.

    The tests drive two private hooks of ``MinerUConverter``:
    ``_post_process_markdown(path)``, which is applied to a markdown file
    on disk and whose result is read back from that same file, and
    ``_clean_markdown_content(text)``, whose return value is inspected.
    """

    @pytest.fixture
    def temp_library(self):
        """Yield ``LibraryPaths`` rooted in a throwaway directory.

        The directory is created under ``./.tmp`` with a unique name and is
        removed again when the test finishes, so repeated runs neither
        collide with each other nor leave files behind.
        """
        scratch_root = Path("./.tmp")
        scratch_root.mkdir(parents=True, exist_ok=True)
        with tempfile.TemporaryDirectory(
            prefix="test_postprocess_", dir=scratch_root
        ) as temp_dir:
            library_paths = LibraryPaths.from_root(Path(temp_dir))
            library_paths.create_directories()
            yield library_paths

    @pytest.fixture
    def converter(self, temp_library):
        """Create a MinerUConverter backed by the temporary library."""
        storage_manager = PaperStorageManager(temp_library)
        return MinerUConverter(storage_manager)

    @staticmethod
    def _write_temp_markdown(content):
        """Write ``content`` to a new UTF-8 ``.md`` temp file; return its path.

        The file is created with ``delete=False``, so the caller is
        responsible for removing it.
        """
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".md", delete=False, encoding="utf-8"
        ) as tmp:
            tmp.write(content)
        return Path(tmp.name)

    def test_image_reference_replacement(self, converter):
        """Test that image references are correctly updated."""
        # Markdown mixing ``images/`` references (expected to be rewritten)
        # with an external URL and a local path (expected to be left alone).
        # NOTE(review): the alt texts and file names below are placeholders;
        # only the two URLs asserted on further down are fixed by this test.
        test_content = """# Test Document

Here's an image with alt text:
![Alt text](images/figure1.jpg)

Here's an image without alt text:
![](images/figure2.png)

Some text content.

Here's another image:
![Another image](images/figure3.jpg)

This should not be changed:
![External image](https://example.com/image.jpg)

And this local reference should not change:
![Local image](./local_images/test.png)
"""

        tmp_path = self._write_temp_markdown(test_content)

        try:
            # Apply post-processing, then read the file back.
            converter._post_process_markdown(tmp_path)
            result_content = tmp_path.read_text(encoding="utf-8")

            # NOTE(review): assertions on the exact rewritten references
            # belong here; they need the converter's target path prefix,
            # which this test file does not state - TODO confirm and add.

            # Verify external and local references were NOT changed
            assert "https://example.com/image.jpg" in result_content
            assert "./local_images/test.png" in result_content

            # Verify no "images/" references remain
            assert "](images/" not in result_content

        finally:
            if tmp_path.exists():
                tmp_path.unlink()

    def test_markdown_content_cleaning(self, converter):
        """Test markdown content cleaning functionality."""
        # NOTE(review): the runs of spaces and the tab below are
        # reconstructed from what each line says about itself - confirm
        # they match the whitespace the cleaner is meant to handle.
        test_content = """# Title with Extra Spaces


Here's a paragraph  with   multiple    spaces.

    \tIndented line with tabs and spaces.


Another paragraph.



Too many blank lines above.
"""

        result = converter._clean_markdown_content(test_content)

        # Check that excessive whitespace within lines is cleaned
        for line in result.split("\n"):
            if not line.strip():
                continue  # blank lines are not subject to this check
            # No run of two or more spaces, except on lines indented as a
            # code block.
            # NOTE(review): the four-space exemption assumes markdown's
            # indented-code-block convention - confirm.
            assert "  " not in line or line.startswith("    ")

    def test_post_process_error_handling(self, converter):
        """Test that post-processing errors don't crash conversion.

        Both calls pass simply by returning; any exception escaping
        ``_post_process_markdown`` fails the test.
        """
        # Case 1: a path that does not exist.
        fake_path = Path("./.tmp/nonexistent.md")
        converter._post_process_markdown(fake_path)

        # Case 2: a path that was a real temp file a moment ago but has
        # been deleted before post-processing runs. (This does not exercise
        # a permission error; the file is simply gone.)
        with tempfile.NamedTemporaryFile(suffix=".md", delete=False) as tmp:
            tmp_path = Path(tmp.name)

        try:
            tmp_path.unlink()

            # Should handle gracefully
            converter._post_process_markdown(tmp_path)

        finally:
            # Cleanup if file somehow still exists
            if tmp_path.exists():
                tmp_path.unlink()

    def test_complex_image_patterns(self, converter):
        """Test complex image reference patterns."""
        # NOTE(review): alt texts and file names are placeholders, apart
        # from the parenthesised ``(1).jpg`` name, which the ".jpg)"
        # assertion below relies on.
        test_content = """
Various image patterns:

![Simple](images/simple.jpg)
![](images/empty-alt.png)
![With-dashes](images/with-dashes.png)
![With_underscores](images/with_underscores.gif)
![With parentheses](images/file(1).jpg)
![Alt text with spaces](images/figure2.jpg)

Non-image patterns that should not change:
[Link text](images/not-an-image)
`code with images/path`
    code block with images/reference
"""

        tmp_path = self._write_temp_markdown(test_content)

        try:
            converter._post_process_markdown(tmp_path)
            result = tmp_path.read_text(encoding="utf-8")

            # Verify all image references were updated: the only
            # "](images/" left should be the plain link checked below.
            # NOTE(review): add assertions on the exact rewritten paths
            # once the converter's target prefix is confirmed.
            assert result.count("](images/") == 1
            assert ".jpg)" in result

            # Verify non-image patterns were preserved
            assert "[Link text](images/not-an-image)" in result
            assert "`code with images/path`" in result
            # Leading spaces may be removed by cleaning, so match the text
            # of the code-block line without its indentation.
            assert "code block with images/reference" in result

        finally:
            if tmp_path.exists():
                tmp_path.unlink()
|