fix: arxiv year
This commit is contained in:
@@ -53,16 +53,21 @@ papers/arxiv/YEAR/arxiv-NORMALIZED_ID/
|
|||||||
```
|
```
|
||||||
|
|
||||||
Where:
|
Where:
|
||||||
- `YEAR` is extracted from the arXiv ID (e.g., `2212.06340` → `2022`)
|
- `YEAR` is extracted from the arXiv ID (e.g., `2212.06340` → `2022`, `0001.12345` → `2000`)
|
||||||
- `NORMALIZED_ID` is the arXiv ID with dots replaced by underscores; a version suffix (e.g., `v2`) is kept unchanged
|
- `NORMALIZED_ID` is the arXiv ID with dots replaced by underscores; a version suffix (e.g., `v2`) is kept unchanged
|
||||||
- `2212.06340` → `arxiv-2212_06340`
|
- `2212.06340` → `arxiv-2212_06340`
|
||||||
- `2212.06340v2` → `arxiv-2212_06340v2`
|
- `2212.06340v2` → `arxiv-2212_06340v2`
|
||||||
|
|
||||||
|
The year is taken from the two-digit `YY` prefix of arXiv's `YYMM.NNNNN` identifier format and expanded to four digits:
|
||||||
|
- Years 00-89 map to 2000-2089
|
||||||
|
- Years 90-99 map to 1990-1999
|
||||||
|
|
||||||
**Examples:**
|
**Examples:**
|
||||||
```
|
```
|
||||||
papers/arxiv/2022/arxiv-2212_06340/
|
papers/arxiv/2022/arxiv-2212_06340/ # 2212.06340 -> year 2022
|
||||||
papers/arxiv/2023/arxiv-2301_12345v1/
|
papers/arxiv/2023/arxiv-2301_12345v1/ # 2301.12345v1 -> year 2023
|
||||||
papers/arxiv/2024/arxiv-2405_98765/
|
papers/arxiv/2000/arxiv-0001_98765/ # 0001.98765 -> year 2000
|
||||||
|
papers/arxiv/1999/arxiv-9912_12345/ # 9912.12345 -> year 1999
|
||||||
```
|
```
|
||||||
|
|
||||||
### Local Papers
|
### Local Papers
|
||||||
|
|||||||
@@ -43,10 +43,15 @@ class PaperStorageManager:
|
|||||||
if source_type == SourceType.ARXIV:
|
if source_type == SourceType.ARXIV:
|
||||||
# Extract year from arXiv ID pattern (e.g., "2212.06340" -> "2022")
|
# Extract year from arXiv ID pattern (e.g., "2212.06340" -> "2022")
|
||||||
arxiv_id = paper_id.replace("arxiv-", "").replace("_", ".")
|
arxiv_id = paper_id.replace("arxiv-", "").replace("_", ".")
|
||||||
year_part = arxiv_id[:4]
|
year_part = arxiv_id[:2] # Get YY part
|
||||||
# Modern arXiv format: YYMM.NNNNN
|
# Modern arXiv format: YYMM.NNNNN
|
||||||
if len(year_part) == 4 and year_part.isdigit():
|
if len(year_part) == 2 and year_part.isdigit():
|
||||||
year = year_part
|
# Convert 2-digit year to 4-digit year
|
||||||
|
yy = int(year_part)
|
||||||
|
if yy >= 90: # 90-99 maps to 1990-1999
|
||||||
|
year = str(1900 + yy)
|
||||||
|
else: # 00-89 maps to 2000-2089
|
||||||
|
year = str(2000 + yy)
|
||||||
else:
|
else:
|
||||||
# Fallback to current year for older formats
|
# Fallback to current year for older formats
|
||||||
year = str(datetime.now().year)
|
year = str(datetime.now().year)
|
||||||
|
|||||||
@@ -0,0 +1,139 @@
|
|||||||
|
"""Test for arXiv year extraction bug fix."""
|
||||||
|
|
||||||
|
import shutil
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from paperlib.config import LibraryPaths
|
||||||
|
from paperlib.models import SourceType
|
||||||
|
from paperlib.storage import PaperStorageManager
|
||||||
|
|
||||||
|
|
||||||
|
class TestArxivYearFix:
    """Regression tests for deriving the storage year from an arXiv ID.

    Each test asks ``PaperStorageManager.get_paper_directory`` for the
    directory of an arXiv paper and checks that the year path component is
    the four-digit expansion of the ID's two-digit ``YY`` prefix (for
    example ``22`` -> ``2022``, ``99`` -> ``1999``, ``00`` -> ``2000``).
    """

    @pytest.fixture
    def temp_library(self):
        """Yield library paths rooted in a throwaway directory.

        The directory is removed once the test finishes. Setup runs inside
        the ``try`` block so the directory is also removed when creating the
        library layout fails before the test body ever starts.
        """
        temp_dir = Path("./.tmp") / f"test_arxiv_year_{hash(self)}"
        try:
            temp_dir.mkdir(parents=True, exist_ok=True)
            library_paths = LibraryPaths.from_root(temp_dir)
            library_paths.create_directories()

            yield library_paths
        finally:
            # Cleanup
            if temp_dir.exists():
                shutil.rmtree(temp_dir)

    @pytest.fixture
    def storage_manager(self, temp_library):
        """Create a storage manager backed by the temporary library."""
        return PaperStorageManager(temp_library)

    @staticmethod
    def _assert_stored_under_year(storage_manager, paper_id, year):
        """Assert *paper_id* resolves to ``papers_dir/arxiv/<year>/<paper_id>``."""
        paper_dir = storage_manager.get_paper_directory(paper_id, SourceType.ARXIV)
        expected = (
            storage_manager.library_paths.papers_dir / "arxiv" / year / paper_id
        )
        assert paper_dir == expected

    def test_arxiv_year_extraction_2022(self, storage_manager):
        """Test year extraction for 2022 paper (2212.06340)."""
        # Should extract year 2022 from 2212.06340
        self._assert_stored_under_year(storage_manager, "arxiv-2212_06340", "2022")

    def test_arxiv_year_extraction_2023(self, storage_manager):
        """Test year extraction for 2023 paper (2301.12345)."""
        # Should extract year 2023 from 2301.12345
        self._assert_stored_under_year(storage_manager, "arxiv-2301_12345", "2023")

    def test_arxiv_year_extraction_2020(self, storage_manager):
        """Test year extraction for 2020 paper (2005.67890)."""
        # Should extract year 2020 from 2005.67890
        self._assert_stored_under_year(storage_manager, "arxiv-2005_67890", "2020")

    def test_arxiv_year_extraction_1999(self, storage_manager):
        """Test year extraction for 1999 paper (9912.12345)."""
        # Should extract year 1999 from 9912.12345 (99 -> 1999)
        self._assert_stored_under_year(storage_manager, "arxiv-9912_12345", "1999")

    def test_arxiv_year_extraction_2000(self, storage_manager):
        """Test year extraction for 2000 paper (0001.12345)."""
        # Should extract year 2000 from 0001.12345 (00 -> 2000)
        self._assert_stored_under_year(storage_manager, "arxiv-0001_12345", "2000")

    def test_arxiv_id_with_version(self, storage_manager):
        """Test year extraction with version number."""
        # Should extract year 2022 from 2212.06340v1; the version suffix
        # stays part of the directory name.
        self._assert_stored_under_year(storage_manager, "arxiv-2212_06340v1", "2022")

    def test_existing_storage_test_still_passes(self, storage_manager):
        """Ensure we didn't break the existing test case."""
        # This matches the test case in test_storage.py.
        # The old test expected papers/arxiv/2212/ but should now be papers/arxiv/2022/
        self._assert_stored_under_year(storage_manager, "arxiv-2212_06340", "2022")
|
||||||
@@ -72,10 +72,11 @@ class TestPaperStorageManager:
|
|||||||
"arxiv-2212_06340", SourceType.ARXIV
|
"arxiv-2212_06340", SourceType.ARXIV
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Should extract year 2022 from 2212.06340 (22 -> 2022)
|
||||||
expected = (
|
expected = (
|
||||||
storage_manager.library_paths.papers_dir
|
storage_manager.library_paths.papers_dir
|
||||||
/ "arxiv"
|
/ "arxiv"
|
||||||
/ "2212"
|
/ "2022"
|
||||||
/ "arxiv-2212_06340"
|
/ "arxiv-2212_06340"
|
||||||
)
|
)
|
||||||
assert paper_dir == expected
|
assert paper_dir == expected
|
||||||
|
|||||||
Reference in New Issue
Block a user