274 lines
9.7 KiB
Python
274 lines
9.7 KiB
Python
"""Tests for paperlib import functionality."""
|
|
|
|
import shutil
|
|
from pathlib import Path
|
|
from unittest.mock import Mock, patch
|
|
|
|
import pytest
|
|
|
|
from paperlib.config import LibraryPaths
|
|
from paperlib.importer import ArxivImporter, LocalImporter
|
|
from paperlib.models import SourceType
|
|
from paperlib.storage import PaperStorageManager
|
|
|
|
|
|
class TestLocalImporter:
    """Tests for importing local PDF files through ``LocalImporter``."""

    @pytest.fixture
    def temp_library(self):
        """Yield library paths rooted in a scratch directory under ``./.tmp``.

        The scratch directory is created before the test and removed again
        once the test has finished.
        """
        temp_dir = Path("./.tmp") / f"test_import_{hash(self)}"
        temp_dir.mkdir(parents=True, exist_ok=True)
        library_paths = LibraryPaths.from_root(temp_dir)
        library_paths.create_directories()

        yield library_paths

        # Cleanup
        if temp_dir.exists():
            shutil.rmtree(temp_dir)

    @pytest.fixture
    def local_importer(self, temp_library):
        """Return a ``LocalImporter`` whose storage lives in ``temp_library``."""
        storage_manager = PaperStorageManager(temp_library)
        return LocalImporter(storage_manager)

    @pytest.fixture
    def sample_pdf(self):
        """Yield the path of a minimal PDF file and delete it afterwards."""
        pdf_file = Path("./.tmp") / f"sample_{hash(self)}.pdf"
        # Create the parent directory here so the fixture does not depend on
        # another fixture having created ./.tmp beforehand.
        pdf_file.parent.mkdir(parents=True, exist_ok=True)
        with pdf_file.open("wb") as f:
            # Minimal PDF content
            f.write(b"%PDF-1.4\n")
            f.write(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n")
            f.write(b"%%EOF\n")

        yield pdf_file

        # Cleanup (the file may already be gone if a test renamed it)
        if pdf_file.exists():
            pdf_file.unlink()

    def test_import_pdf_success(self, local_importer, sample_pdf):
        """Importing a PDF with explicit fields returns matching metadata."""
        metadata = local_importer.import_pdf(
            pdf_path=sample_pdf,
            title="Test Paper",
            notes="Test notes",
            tags=["test", "sample"],
        )

        # Check metadata
        assert metadata.source_type == SourceType.LOCAL
        assert metadata.title == "Test Paper"
        assert metadata.notes == "Test notes"
        assert metadata.tags == ["test", "sample"]
        assert metadata.paper_id.startswith("local-")

    def test_import_pdf_auto_title(self, local_importer, sample_pdf):
        """Importing without a title derives one from the file name."""
        # Rename PDF to have a meaningful name
        meaningful_pdf = sample_pdf.parent / "Machine_Learning-Paper.pdf"
        sample_pdf.rename(meaningful_pdf)

        try:
            metadata = local_importer.import_pdf(pdf_path=meaningful_pdf)

            # Title should be auto-generated from filename
            assert metadata.title == "Machine Learning Paper"
        finally:
            # The fixture only cleans up the original path, so remove the
            # renamed file here.
            if meaningful_pdf.exists():
                meaningful_pdf.unlink()

    def test_import_nonexistent_pdf(self, local_importer):
        """Importing a path that does not exist raises ``FileNotFoundError``."""
        nonexistent = Path("./.tmp/nonexistent.pdf")

        with pytest.raises(FileNotFoundError):
            local_importer.import_pdf(pdf_path=nonexistent)

    def test_import_non_pdf_file(self, local_importer):
        """Importing a non-PDF file raises ``ValueError``."""
        text_file = Path("./.tmp") / "not_a_pdf.txt"
        with text_file.open("w") as f:
            f.write("This is not a PDF")

        try:
            with pytest.raises(ValueError, match="File is not a PDF"):
                local_importer.import_pdf(pdf_path=text_file)
        finally:
            if text_file.exists():
                text_file.unlink()

    def test_import_duplicate_pdf(self, local_importer, sample_pdf):
        """Importing the same PDF a second time raises ``ValueError``."""
        # Import once; the returned metadata is not needed for this check.
        local_importer.import_pdf(pdf_path=sample_pdf)

        # Try to import again
        with pytest.raises(ValueError, match="Paper already imported"):
            local_importer.import_pdf(pdf_path=sample_pdf)
|
|
|
|
|
|
class TestArxivImporter:
    """Tests for ``ArxivImporter`` with the arXiv client and downloads mocked."""

    @staticmethod
    def _make_author(name):
        """Return a mock author whose ``name`` attribute is the string *name*.

        ``Mock(name=...)`` cannot be used for this: the ``name`` keyword of
        the ``Mock`` constructor only labels the mock in its repr, so the
        attribute has to be assigned after construction.
        """
        author = Mock()
        author.name = name
        return author

    @pytest.fixture
    def temp_library(self):
        """Yield library paths rooted in a scratch directory under ``./.tmp``.

        The scratch directory is created before the test and removed again
        once the test has finished.
        """
        temp_dir = Path("./.tmp") / f"test_arxiv_{hash(self)}"
        temp_dir.mkdir(parents=True, exist_ok=True)
        library_paths = LibraryPaths.from_root(temp_dir)
        library_paths.create_directories()

        yield library_paths

        # Cleanup
        if temp_dir.exists():
            shutil.rmtree(temp_dir)

    @pytest.fixture
    def arxiv_importer(self, temp_library):
        """Return an ``ArxivImporter`` whose storage lives in ``temp_library``."""
        storage_manager = PaperStorageManager(temp_library)
        return ArxivImporter(storage_manager)

    def test_extract_arxiv_id_clean(self, arxiv_importer):
        """Plain, prefixed, versioned and old-style IDs are extracted."""
        # Test various formats
        assert arxiv_importer.extract_arxiv_id("2212.06340") == "2212.06340"
        assert arxiv_importer.extract_arxiv_id("arxiv:2212.06340") == "2212.06340"
        assert arxiv_importer.extract_arxiv_id("2212.06340v1") == "2212.06340v1"
        assert arxiv_importer.extract_arxiv_id("math-ph/0701002") == "math-ph/0701002"

    def test_extract_arxiv_id_from_url(self, arxiv_importer):
        """An abstract-page URL yields the bare arXiv ID."""
        url = "https://arxiv.org/abs/2212.06340"
        extracted = arxiv_importer.extract_arxiv_id(url)
        assert extracted == "2212.06340"

    def test_fetch_paper_metadata_success(self, arxiv_importer):
        """The single result returned by the client is passed back as-is."""
        # Mock arXiv result
        mock_result = Mock()
        mock_result.title = "Test Paper"
        mock_result.authors = [
            self._make_author("Alice Smith"),
            self._make_author("Bob Jones"),
        ]
        mock_result.published = Mock()
        mock_result.updated = Mock()
        mock_result.categories = ["cs.AI", "stat.ML"]

        # Mock the client's results method directly
        arxiv_importer.client.results = Mock(return_value=[mock_result])

        # Test
        result = arxiv_importer.fetch_paper_metadata("2212.06340")
        assert result == mock_result

    def test_fetch_paper_metadata_not_found(self, arxiv_importer):
        """An empty result list from the client raises ``ValueError``."""
        # Mock empty results
        arxiv_importer.client.results = Mock(return_value=[])

        with pytest.raises(ValueError, match="Paper not found on arXiv"):
            arxiv_importer.fetch_paper_metadata("9999.99999")

    @patch("paperlib.importer.arxiv_importer.tempfile.NamedTemporaryFile")
    def test_download_pdf(self, mock_tempfile, arxiv_importer):
        """The PDF is downloaded to the temporary file's path."""
        # Mock temporary file
        mock_temp_path = Path("./.tmp/mock_temp.pdf")
        mock_tempfile.return_value.__enter__.return_value.name = str(mock_temp_path)

        # Mock arXiv result
        mock_result = Mock()

        # Create actual temp file for test
        with mock_temp_path.open("wb") as f:
            f.write(b"%PDF-1.4\n%%EOF\n")

        try:
            pdf_path = arxiv_importer.download_pdf(mock_result)
            assert pdf_path == mock_temp_path
            mock_result.download_pdf.assert_called_once_with(
                filename=str(mock_temp_path)
            )
        finally:
            if mock_temp_path.exists():
                mock_temp_path.unlink()

    @patch.object(ArxivImporter, "download_pdf")
    @patch.object(ArxivImporter, "fetch_paper_metadata")
    def test_import_arxiv_paper_success(
        self, mock_fetch, mock_download, arxiv_importer
    ):
        """A mocked fetch and download produce fully populated metadata."""
        # Mock PDF file
        pdf_file = Path("./.tmp") / "test_arxiv.pdf"
        with pdf_file.open("wb") as f:
            f.write(b"%PDF-1.4\n%%EOF\n")

        try:
            # Mock arXiv result with proper string values
            mock_result = Mock()
            mock_result.title = "Test ArXiv Paper"
            mock_result.authors = [self._make_author("Alice Smith")]
            mock_result.published = None
            mock_result.updated = None
            mock_result.categories = ["cs.AI"]

            mock_fetch.return_value = mock_result
            mock_download.return_value = pdf_file

            # Test import
            metadata = arxiv_importer.import_arxiv_paper(
                arxiv_input="2212.06340", notes="Test notes", tags=["test"]
            )

            # Check results
            assert metadata.source_type == SourceType.ARXIV
            assert metadata.source_id == "2212.06340"
            assert metadata.title == "Test ArXiv Paper"
            assert metadata.authors == ["Alice Smith"]
            assert metadata.categories == ["cs.AI"]
            assert metadata.notes == "Test notes"
            assert metadata.tags == ["test"]
        finally:
            if pdf_file.exists():
                pdf_file.unlink()

    @patch.object(ArxivImporter, "fetch_paper_metadata")
    def test_import_duplicate_arxiv_paper(self, mock_fetch, arxiv_importer):
        """Importing the same arXiv ID a second time raises ``ValueError``."""
        # Mock first import
        pdf_file = Path("./.tmp") / "test_arxiv_dup.pdf"
        with pdf_file.open("wb") as f:
            f.write(b"%PDF-1.4\n%%EOF\n")

        try:
            with patch.object(ArxivImporter, "download_pdf", return_value=pdf_file):
                mock_result = Mock()
                mock_result.title = "Test Paper"
                mock_result.authors = []
                mock_result.published = None
                mock_result.updated = None
                mock_result.categories = []
                mock_fetch.return_value = mock_result

                # First import should succeed
                arxiv_importer.import_arxiv_paper("2212.06340")

                # Second import should fail
                with pytest.raises(ValueError, match="Paper already imported"):
                    arxiv_importer.import_arxiv_paper("2212.06340")
        finally:
            if pdf_file.exists():
                pdf_file.unlink()
|