"""Tests for paperlib import functionality.""" import shutil from pathlib import Path from unittest.mock import Mock, patch import pytest from paperlib.config import LibraryPaths from paperlib.importer import ArxivImporter, LocalImporter from paperlib.models import SourceType from paperlib.storage import PaperStorageManager class TestLocalImporter: """Test LocalImporter functionality.""" @pytest.fixture def temp_library(self): """Create a temporary library for testing.""" temp_dir = Path("./.tmp") / f"test_import_{hash(self)}" temp_dir.mkdir(parents=True, exist_ok=True) library_paths = LibraryPaths.from_root(temp_dir) library_paths.create_directories() yield library_paths # Cleanup if temp_dir.exists(): shutil.rmtree(temp_dir) @pytest.fixture def local_importer(self, temp_library): """Create a LocalImporter for testing.""" storage_manager = PaperStorageManager(temp_library) return LocalImporter(storage_manager) @pytest.fixture def sample_pdf(self): """Create a sample PDF file for testing.""" pdf_file = Path("./.tmp") / f"sample_{hash(self)}.pdf" with pdf_file.open("wb") as f: # Minimal PDF content f.write(b"%PDF-1.4\n") f.write(b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n") f.write(b"%%EOF\n") yield pdf_file # Cleanup if pdf_file.exists(): pdf_file.unlink() def test_import_pdf_success(self, local_importer, sample_pdf): """Test successful PDF import.""" metadata = local_importer.import_pdf( pdf_path=sample_pdf, title="Test Paper", notes="Test notes", tags=["test", "sample"], ) # Check metadata assert metadata.source_type == SourceType.LOCAL assert metadata.title == "Test Paper" assert metadata.notes == "Test notes" assert metadata.tags == ["test", "sample"] assert metadata.paper_id.startswith("local-") def test_import_pdf_auto_title(self, local_importer, sample_pdf): """Test PDF import with auto-generated title.""" # Rename PDF to have a meaningful name meaningful_pdf = sample_pdf.parent / "Machine_Learning-Paper.pdf" sample_pdf.rename(meaningful_pdf) try: metadata = local_importer.import_pdf(pdf_path=meaningful_pdf) # Title should be auto-generated from filename assert metadata.title == "Machine Learning Paper" finally: if meaningful_pdf.exists(): meaningful_pdf.unlink() def test_import_nonexistent_pdf(self, local_importer): """Test importing non-existent PDF file.""" nonexistent = Path("./.tmp/nonexistent.pdf") with pytest.raises(FileNotFoundError): local_importer.import_pdf(pdf_path=nonexistent) def test_import_non_pdf_file(self, local_importer): """Test importing non-PDF file.""" text_file = Path("./.tmp") / "not_a_pdf.txt" with text_file.open("w") as f: f.write("This is not a PDF") try: with pytest.raises(ValueError, match="File is not a PDF"): local_importer.import_pdf(pdf_path=text_file) finally: if text_file.exists(): text_file.unlink() def test_import_duplicate_pdf(self, local_importer, sample_pdf): """Test importing the same PDF twice.""" # Import once local_importer.import_pdf(pdf_path=sample_pdf) # Try to import again with pytest.raises(ValueError, match="Paper already imported"): local_importer.import_pdf(pdf_path=sample_pdf) class TestArxivImporter: """Test ArxivImporter functionality.""" @pytest.fixture def temp_library(self): """Create a temporary library for testing.""" temp_dir = Path("./.tmp") / f"test_arxiv_{hash(self)}" temp_dir.mkdir(parents=True, exist_ok=True) library_paths = LibraryPaths.from_root(temp_dir) library_paths.create_directories() yield library_paths # Cleanup if temp_dir.exists(): shutil.rmtree(temp_dir) @pytest.fixture def arxiv_importer(self, temp_library): """Create an ArxivImporter for testing.""" storage_manager = PaperStorageManager(temp_library) return ArxivImporter(storage_manager) def test_extract_arxiv_id_clean(self, arxiv_importer): """Test extracting clean arXiv ID.""" # Test various formats assert arxiv_importer.extract_arxiv_id("2212.06340") == "2212.06340" assert arxiv_importer.extract_arxiv_id("arxiv:2212.06340") == "2212.06340" assert arxiv_importer.extract_arxiv_id("2212.06340v1") == "2212.06340v1" assert arxiv_importer.extract_arxiv_id("math-ph/0701002") == "math-ph/0701002" def test_extract_arxiv_id_from_url(self, arxiv_importer): """Test extracting arXiv ID from URLs.""" url = "https://arxiv.org/abs/2212.06340" extracted = arxiv_importer.extract_arxiv_id(url) assert extracted == "2212.06340" def test_fetch_paper_metadata_success(self, arxiv_importer): """Test successful metadata fetching from arXiv.""" # Mock arXiv result mock_result = Mock() mock_result.title = "Test Paper" mock_result.authors = [Mock(name="Alice Smith"), Mock(name="Bob Jones")] mock_result.published = Mock() mock_result.updated = Mock() mock_result.categories = ["cs.AI", "stat.ML"] # Mock the client's results method directly arxiv_importer.client.results = Mock(return_value=[mock_result]) # Test result = arxiv_importer.fetch_paper_metadata("2212.06340") assert result == mock_result def test_fetch_paper_metadata_not_found(self, arxiv_importer): """Test fetching metadata for non-existent paper.""" # Mock empty results arxiv_importer.client.results = Mock(return_value=[]) with pytest.raises(ValueError, match="Paper not found on arXiv"): arxiv_importer.fetch_paper_metadata("9999.99999") @patch("paperlib.importer.arxiv_importer.tempfile.NamedTemporaryFile") def test_download_pdf(self, mock_tempfile, arxiv_importer): """Test PDF downloading.""" # Mock temporary file mock_temp_path = Path("./.tmp/mock_temp.pdf") mock_tempfile.return_value.__enter__.return_value.name = str(mock_temp_path) # Mock arXiv result mock_result = Mock() # Create actual temp file for test with mock_temp_path.open("wb") as f: f.write(b"%PDF-1.4\n%%EOF\n") try: pdf_path = arxiv_importer.download_pdf(mock_result) assert pdf_path == mock_temp_path mock_result.download_pdf.assert_called_once_with( filename=str(mock_temp_path) ) finally: if mock_temp_path.exists(): mock_temp_path.unlink() @patch.object(ArxivImporter, "download_pdf") @patch.object(ArxivImporter, "fetch_paper_metadata") def test_import_arxiv_paper_success( self, mock_fetch, mock_download, arxiv_importer ): """Test successful arXiv paper import.""" # Mock PDF file pdf_file = Path("./.tmp") / "test_arxiv.pdf" with pdf_file.open("wb") as f: f.write(b"%PDF-1.4\n%%EOF\n") try: # Mock arXiv result with proper string values mock_author = Mock() mock_author.name = "Alice Smith" mock_result = Mock() mock_result.title = "Test ArXiv Paper" mock_result.authors = [mock_author] mock_result.published = None mock_result.updated = None mock_result.categories = ["cs.AI"] mock_fetch.return_value = mock_result mock_download.return_value = pdf_file # Test import metadata = arxiv_importer.import_arxiv_paper( arxiv_input="2212.06340", notes="Test notes", tags=["test"] ) # Check results assert metadata.source_type == SourceType.ARXIV assert metadata.source_id == "2212.06340" assert metadata.title == "Test ArXiv Paper" assert metadata.authors == ["Alice Smith"] assert metadata.categories == ["cs.AI"] assert metadata.notes == "Test notes" assert metadata.tags == ["test"] finally: if pdf_file.exists(): pdf_file.unlink() @patch.object(ArxivImporter, "fetch_paper_metadata") def test_import_duplicate_arxiv_paper(self, mock_fetch, arxiv_importer): """Test importing the same arXiv paper twice.""" # Mock first import pdf_file = Path("./.tmp") / "test_arxiv_dup.pdf" with pdf_file.open("wb") as f: f.write(b"%PDF-1.4\n%%EOF\n") try: with patch.object(ArxivImporter, "download_pdf", return_value=pdf_file): mock_result = Mock() mock_result.title = "Test Paper" mock_result.authors = [] mock_result.published = None mock_result.updated = None mock_result.categories = [] mock_fetch.return_value = mock_result # First import should succeed arxiv_importer.import_arxiv_paper("2212.06340") # Second import should fail with pytest.raises(ValueError, match="Paper already imported"): arxiv_importer.import_arxiv_paper("2212.06340") finally: if pdf_file.exists(): pdf_file.unlink()