From 174801242d7bc35caa4ca146c4eb1e48fe28c3e7 Mon Sep 17 00:00:00 2001
From: Yingjie Wang
Date: Fri, 17 Apr 2026 17:03:59 -0400
Subject: [PATCH] fix: arxiv year

---
Reviewer notes. Everything between the "---" separator above and the first
"diff --git" line is outside the commit message and outside the diff payload.

What this patch does:
  * src/paperlib/storage/manager.py: for SourceType.ARXIV the year directory
    is now built from the first two characters (YY) of the ID instead of
    using the first four characters (YYMM) verbatim. YY 90-99 becomes 19YY,
    YY 00-89 becomes 20YY. A prefix that is not two digits still takes the
    pre-existing current-year fallback.
  * docs/storage-layout.md: documents the YY -> YYYY mapping and replaces
    the example paths.
  * tests/test_arxiv_year_fix.py (new file): seven cases that compare the
    result of get_paper_directory() with the expected path.
  * tests/test_storage.py: the expected directory changes from "2212" to
    "2022".

Points to confirm before merging:
  * NOTE(review): this patch contains no migration for directories already
    created under the old papers/arxiv/YYMM/ layout -- confirm whether
    existing libraries need one.
  * NOTE(review): arXiv's YYMM.NNNN identifiers appear to start at 0704;
    earlier papers use archive/YYMMNNN (e.g. hep-th/9901001). IDs such as
    9912.12345 and 0001.12345 in the tests and docs would then not be real
    identifiers, and an old-style ID starts with letters, so it would take
    the current-year fallback -- confirm this is the intended handling.
  * NOTE(review): test_existing_storage_test_still_passes uses the same ID
    and the same expected path as test_arxiv_year_extraction_2022.
  * NOTE(review): the temp_library fixture names its directory with
    hash(self) under ./.tmp -- consider pytest's tmp_path fixture instead.
  * NOTE(review): the unchanged docs line says NORMALIZED_ID "replaces dots
    and version numbers with underscores", but the example
    2212.06340v2 -> arxiv-2212_06340v2 keeps the version suffix.

 docs/storage-layout.md          |  13 ++-
 src/paperlib/storage/manager.py |  11 ++-
 tests/test_arxiv_year_fix.py    | 139 ++++++++++++++++++++++++++++++++
 tests/test_storage.py           |   3 +-
 4 files changed, 158 insertions(+), 8 deletions(-)
 create mode 100644 tests/test_arxiv_year_fix.py

diff --git a/docs/storage-layout.md b/docs/storage-layout.md
index eaeb024..35ea699 100644
--- a/docs/storage-layout.md
+++ b/docs/storage-layout.md
@@ -53,16 +53,21 @@ papers/arxiv/YEAR/arxiv-NORMALIZED_ID/
 ```
 
 Where:
-- `YEAR` is extracted from the arXiv ID (e.g., `2212.06340` → `2022`)
+- `YEAR` is extracted from the arXiv ID (e.g., `2212.06340` → `2022`, `0001.12345` → `2000`)
 - `NORMALIZED_ID` replaces dots and version numbers with underscores
   - `2212.06340` → `arxiv-2212_06340`
   - `2212.06340v2` → `arxiv-2212_06340v2`
 
+The year extraction follows arXiv's YYMM.NNNNN format:
+- Years 00-89 map to 2000-2089
+- Years 90-99 map to 1990-1999
+
 **Examples:**
 ```
-papers/arxiv/2022/arxiv-2212_06340/
-papers/arxiv/2023/arxiv-2301_12345v1/
-papers/arxiv/2024/arxiv-2405_98765/
+papers/arxiv/2022/arxiv-2212_06340/     # 2212.06340 -> year 2022
+papers/arxiv/2023/arxiv-2301_12345v1/   # 2301.12345v1 -> year 2023
+papers/arxiv/2000/arxiv-0001_98765/     # 0001.98765 -> year 2000
+papers/arxiv/1999/arxiv-9912_12345/     # 9912.12345 -> year 1999
 ```
 
 ### Local Papers
diff --git a/src/paperlib/storage/manager.py b/src/paperlib/storage/manager.py
index da62027..e630ed4 100644
--- a/src/paperlib/storage/manager.py
+++ b/src/paperlib/storage/manager.py
@@ -43,10 +43,15 @@ class PaperStorageManager:
         if source_type == SourceType.ARXIV:
             # Extract year from arXiv ID pattern (e.g., "2212.06340" -> "2022")
             arxiv_id = paper_id.replace("arxiv-", "").replace("_", ".")
-            year_part = arxiv_id[:4]
+            year_part = arxiv_id[:2]  # Get YY part
             # Modern arXiv format: YYMM.NNNNN
-            if len(year_part) == 4 and year_part.isdigit():
-                year = year_part
+            if len(year_part) == 2 and year_part.isdigit():
+                # Convert 2-digit year to 4-digit year
+                yy = int(year_part)
+                if yy >= 90:  # 90-99 maps to 1990-1999
+                    year = str(1900 + yy)
+                else:  # 00-89 maps to 2000-2089
+                    year = str(2000 + yy)
             else:
                 # Fallback to current year for older formats
                 year = str(datetime.now().year)
diff --git a/tests/test_arxiv_year_fix.py b/tests/test_arxiv_year_fix.py
new file mode 100644
index 0000000..9503648
--- /dev/null
+++ b/tests/test_arxiv_year_fix.py
@@ -0,0 +1,139 @@
+"""Test for arXiv year extraction bug fix."""
+
+import shutil
+from pathlib import Path
+
+import pytest
+
+from paperlib.config import LibraryPaths
+from paperlib.models import SourceType
+from paperlib.storage import PaperStorageManager
+
+
+class TestArxivYearFix:
+    """Test the arXiv year extraction fix."""
+
+    @pytest.fixture
+    def temp_library(self):
+        """Create a temporary library for testing."""
+        temp_dir = Path("./.tmp") / f"test_arxiv_year_{hash(self)}"
+        temp_dir.mkdir(parents=True, exist_ok=True)
+        library_paths = LibraryPaths.from_root(temp_dir)
+        library_paths.create_directories()
+
+        yield library_paths
+
+        # Cleanup
+        if temp_dir.exists():
+            shutil.rmtree(temp_dir)
+
+    @pytest.fixture
+    def storage_manager(self, temp_library):
+        """Create a storage manager for testing."""
+        return PaperStorageManager(temp_library)
+
+    def test_arxiv_year_extraction_2022(self, storage_manager):
+        """Test year extraction for 2022 paper (2212.06340)."""
+        paper_dir = storage_manager.get_paper_directory(
+            "arxiv-2212_06340", SourceType.ARXIV
+        )
+
+        # Should extract year 2022 from 2212.06340
+        expected = (
+            storage_manager.library_paths.papers_dir
+            / "arxiv"
+            / "2022"
+            / "arxiv-2212_06340"
+        )
+        assert paper_dir == expected
+
+    def test_arxiv_year_extraction_2023(self, storage_manager):
+        """Test year extraction for 2023 paper (2301.12345)."""
+        paper_dir = storage_manager.get_paper_directory(
+            "arxiv-2301_12345", SourceType.ARXIV
+        )
+
+        # Should extract year 2023 from 2301.12345
+        expected = (
+            storage_manager.library_paths.papers_dir
+            / "arxiv"
+            / "2023"
+            / "arxiv-2301_12345"
+        )
+        assert paper_dir == expected
+
+    def test_arxiv_year_extraction_2020(self, storage_manager):
+        """Test year extraction for 2020 paper (2005.67890)."""
+        paper_dir = storage_manager.get_paper_directory(
+            "arxiv-2005_67890", SourceType.ARXIV
+        )
+
+        # Should extract year 2020 from 2005.67890
+        expected = (
+            storage_manager.library_paths.papers_dir
+            / "arxiv"
+            / "2020"
+            / "arxiv-2005_67890"
+        )
+        assert paper_dir == expected
+
+    def test_arxiv_year_extraction_1999(self, storage_manager):
+        """Test year extraction for 1999 paper (9912.12345)."""
+        paper_dir = storage_manager.get_paper_directory(
+            "arxiv-9912_12345", SourceType.ARXIV
+        )
+
+        # Should extract year 1999 from 9912.12345 (99 -> 1999)
+        expected = (
+            storage_manager.library_paths.papers_dir
+            / "arxiv"
+            / "1999"
+            / "arxiv-9912_12345"
+        )
+        assert paper_dir == expected
+
+    def test_arxiv_year_extraction_2000(self, storage_manager):
+        """Test year extraction for 2000 paper (0001.12345)."""
+        paper_dir = storage_manager.get_paper_directory(
+            "arxiv-0001_12345", SourceType.ARXIV
+        )
+
+        # Should extract year 2000 from 0001.12345 (00 -> 2000)
+        expected = (
+            storage_manager.library_paths.papers_dir
+            / "arxiv"
+            / "2000"
+            / "arxiv-0001_12345"
+        )
+        assert paper_dir == expected
+
+    def test_arxiv_id_with_version(self, storage_manager):
+        """Test year extraction with version number."""
+        paper_dir = storage_manager.get_paper_directory(
+            "arxiv-2212_06340v1", SourceType.ARXIV
+        )
+
+        # Should extract year 2022 from 2212.06340v1
+        expected = (
+            storage_manager.library_paths.papers_dir
+            / "arxiv"
+            / "2022"
+            / "arxiv-2212_06340v1"
+        )
+        assert paper_dir == expected
+
+    def test_existing_storage_test_still_passes(self, storage_manager):
+        """Ensure we didn't break the existing test case."""
+        # This matches the test case in test_storage.py
+        paper_dir = storage_manager.get_paper_directory(
+            "arxiv-2212_06340", SourceType.ARXIV
+        )
+
+        # The old test expected papers/arxiv/2212/ but should now be papers/arxiv/2022/
+        expected = (
+            storage_manager.library_paths.papers_dir
+            / "arxiv"
+            / "2022"
+            / "arxiv-2212_06340"
+        )
+        assert paper_dir == expected
diff --git a/tests/test_storage.py b/tests/test_storage.py
index 647534e..f98e33d 100644
--- a/tests/test_storage.py
+++ b/tests/test_storage.py
@@ -72,10 +72,11 @@ class TestPaperStorageManager:
             "arxiv-2212_06340", SourceType.ARXIV
         )
 
+        # Should extract year 2022 from 2212.06340 (22 -> 2022)
         expected = (
             storage_manager.library_paths.papers_dir
             / "arxiv"
-            / "2212"
+            / "2022"
             / "arxiv-2212_06340"
         )
         assert paper_dir == expected