update: add core functionality
This commit is contained in:
@@ -5,6 +5,7 @@ description = "Local-first CLI toolkit for managing a paper library"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.13,<3.14"
|
||||
dependencies = [
|
||||
"arxiv>=2.0.0",
|
||||
"mineru[core]>=3.0.9",
|
||||
"rich>=15.0.0",
|
||||
"typer>=0.24.1",
|
||||
|
||||
@@ -2,6 +2,5 @@
|
||||
|
||||
from paperlib.cli import main
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
+256
-6
@@ -7,6 +7,10 @@ from pathlib import Path
|
||||
|
||||
from paperlib import __version__
|
||||
from paperlib.config import LibraryPaths
|
||||
from paperlib.converter import MinerUConverter
|
||||
from paperlib.importer import ArxivImporter, LocalImporter
|
||||
from paperlib.index import DatabaseManager
|
||||
from paperlib.storage import PaperStorageManager
|
||||
|
||||
|
||||
def _resolve_library_root(path: Path | None) -> Path:
|
||||
@@ -52,12 +56,15 @@ def _build_parser() -> argparse.ArgumentParser:
|
||||
status_parser.set_defaults(handler=_handle_status)
|
||||
|
||||
list_parser = subparsers.add_parser("list", help="List imported papers.")
|
||||
list_parser.add_argument("--library", "-L", default=".", help="Library root")
|
||||
list_parser.set_defaults(handler=_handle_list)
|
||||
|
||||
show_parser = subparsers.add_parser(
|
||||
"show",
|
||||
help="Show detailed information for a paper.",
|
||||
)
|
||||
show_parser.add_argument("paper_id", help="Paper ID to show")
|
||||
show_parser.add_argument("--library", "-L", default=".", help="Library root")
|
||||
show_parser.set_defaults(handler=_handle_show)
|
||||
|
||||
search_parser = subparsers.add_parser(
|
||||
@@ -66,6 +73,37 @@ def _build_parser() -> argparse.ArgumentParser:
|
||||
)
|
||||
search_parser.set_defaults(handler=_handle_search)
|
||||
|
||||
# Import command
|
||||
import_parser = subparsers.add_parser(
|
||||
"import",
|
||||
help="Import a paper into the library.",
|
||||
)
|
||||
import_group = import_parser.add_mutually_exclusive_group(required=True)
|
||||
import_group.add_argument("--pdf", type=Path, help="Path to a local PDF file")
|
||||
import_group.add_argument("--arxiv", type=str, help="arXiv ID or URL")
|
||||
import_parser.add_argument("--title", type=str, help="Title for local PDFs")
|
||||
import_parser.add_argument("--notes", type=str, default="", help="Notes")
|
||||
import_parser.add_argument("--tags", nargs="*", default=[], help="Tags")
|
||||
import_parser.add_argument("--library", "-L", default=".", help="Library root")
|
||||
import_parser.set_defaults(handler=_handle_import)
|
||||
|
||||
# Convert command
|
||||
convert_parser = subparsers.add_parser(
|
||||
"convert",
|
||||
help="Convert papers to Markdown.",
|
||||
)
|
||||
convert_parser.add_argument("--library", "-L", default=".", help="Library root")
|
||||
convert_parser.add_argument("--paper-id", help="Convert specific paper by ID")
|
||||
convert_parser.set_defaults(handler=_handle_convert)
|
||||
|
||||
# Reindex command
|
||||
reindex_parser = subparsers.add_parser(
|
||||
"reindex",
|
||||
help="Rebuild the search index from stored papers.",
|
||||
)
|
||||
reindex_parser.add_argument("--library", "-L", default=".", help="Library root")
|
||||
reindex_parser.set_defaults(handler=_handle_reindex)
|
||||
|
||||
return parser
|
||||
|
||||
|
||||
@@ -98,17 +136,117 @@ def _handle_status(args: argparse.Namespace) -> int:
|
||||
return 0
|
||||
|
||||
|
||||
def _handle_list(_: argparse.Namespace) -> int:
|
||||
"""Placeholder for listing imported papers."""
|
||||
print("Listing papers is not implemented yet.")
|
||||
def _handle_list(args: argparse.Namespace) -> int:
|
||||
"""List imported papers."""
|
||||
try:
|
||||
paths = LibraryPaths.from_root(
|
||||
_resolve_library_root(
|
||||
Path(args.library if hasattr(args, "library") else ".")
|
||||
)
|
||||
)
|
||||
storage_manager = PaperStorageManager(paths)
|
||||
db_manager = DatabaseManager(paths)
|
||||
|
||||
# Initialize database if it doesn't exist
|
||||
db_manager.initialize_database()
|
||||
|
||||
# List all papers from storage (more reliable than index)
|
||||
papers = list(storage_manager.list_all_papers())
|
||||
|
||||
if not papers:
|
||||
print("No papers found in library.")
|
||||
return 0
|
||||
|
||||
print(f"Found {len(papers)} papers:")
|
||||
print()
|
||||
|
||||
for metadata in papers:
|
||||
status_indicators = []
|
||||
if metadata.conversion_status.value == "success":
|
||||
status_indicators.append("📄") # Converted
|
||||
if metadata.summary_status.value == "success":
|
||||
status_indicators.append("📝") # Summarized
|
||||
|
||||
status_str = "".join(status_indicators) if status_indicators else "⏳"
|
||||
|
||||
print(f"{status_str} {metadata.paper_id}")
|
||||
print(f" {metadata.title}")
|
||||
if metadata.authors:
|
||||
authors_str = ", ".join(metadata.authors[:3])
|
||||
if len(metadata.authors) > 3:
|
||||
authors_str += f" (+{len(metadata.authors) - 3} more)"
|
||||
print(f" By: {authors_str}")
|
||||
if metadata.categories:
|
||||
print(f" Categories: {', '.join(metadata.categories)}")
|
||||
print()
|
||||
|
||||
def _handle_show(_: argparse.Namespace) -> int:
|
||||
"""Placeholder for showing paper details."""
|
||||
print("Showing paper details is not implemented yet.")
|
||||
return 0
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error listing papers: {e}")
|
||||
return 1
|
||||
|
||||
|
||||
def _handle_show(args: argparse.Namespace) -> int:
    """Print the stored details of a single paper.

    Args:
        args: Parsed CLI arguments. ``paper_id`` selects the paper to show;
            ``library`` optionally names the library root and falls back to
            the current directory when the attribute is absent.

    Returns:
        ``0`` when the paper was found and printed; ``1`` when no paper ID
        was given, the paper does not exist, or any error was raised.
    """
    paper_id = getattr(args, "paper_id", None)
    if not paper_id:
        # ``paper_id`` is registered as a positional argument of the ``show``
        # sub-command, so the hint must not refer to a ``--paper-id`` option.
        print("Please specify a paper ID")
        return 1

    try:
        paths = LibraryPaths.from_root(
            _resolve_library_root(Path(getattr(args, "library", ".")))
        )
        storage_manager = PaperStorageManager(paths)

        # Storage is scanned linearly; the first metadata record whose ID
        # matches is the one that gets displayed.
        metadata = next(
            (
                candidate
                for candidate in storage_manager.list_all_papers()
                if candidate.paper_id == paper_id
            ),
            None,
        )
        if metadata is None:
            print(f"Paper not found: {paper_id}")
            return 1

        print(f"Paper ID: {metadata.paper_id}")
        print(f"Source: {metadata.source_type.value}")
        if metadata.source_id:
            print(f"Source ID: {metadata.source_id}")
        print(f"Title: {metadata.title}")
        if metadata.authors:
            print(f"Authors: {', '.join(metadata.authors)}")
        if metadata.published_date:
            print(f"Published: {metadata.published_date.strftime('%Y-%m-%d')}")
        if metadata.categories:
            print(f"Categories: {', '.join(metadata.categories)}")
        if metadata.tags:
            print(f"Tags: {', '.join(metadata.tags)}")
        print(f"Imported: {metadata.imported_at.strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"Conversion Status: {metadata.conversion_status.value}")
        print(f"Summary Status: {metadata.summary_status.value}")
        if metadata.notes:
            print(f"Notes: {metadata.notes}")

        # Each recorded path is joined onto the library root and flagged
        # with a check mark or a cross depending on whether it exists on disk.
        print("\nFiles:")
        stored_files = (
            ("PDF", metadata.pdf_path),
            ("Markdown", metadata.paper_md_path),
            ("Summary", metadata.summary_json_path),
        )
        for label, relative_path in stored_files:
            if not relative_path:
                continue
            marker = "✓" if (paths.root / relative_path).exists() else "✗"
            print(f" {label}: {marker} {relative_path}")

        return 0

    except Exception as e:
        # Top-level CLI boundary: report the problem and signal failure
        # through the exit code instead of a traceback.
        print(f"Error showing paper: {e}")
        return 1
|
||||
|
||||
|
||||
def _handle_search(_: argparse.Namespace) -> int:
|
||||
"""Placeholder for searching the paper library."""
|
||||
@@ -116,6 +254,118 @@ def _handle_search(_: argparse.Namespace) -> int:
|
||||
return 0
|
||||
|
||||
|
||||
def _handle_import(args: argparse.Namespace) -> int:
    """Import one paper (local PDF or arXiv entry) and add it to the index.

    Args:
        args: Parsed CLI arguments providing ``library``, ``pdf``, ``arxiv``,
            ``title``, ``notes`` and ``tags``.

    Returns:
        ``0`` on success, ``1`` if any step raised an exception.
    """
    try:
        library = LibraryPaths.from_root(_resolve_library_root(Path(args.library)))
        storage = PaperStorageManager(library)
        index = DatabaseManager(library)

        # The schema has to exist before the imported paper can be indexed.
        index.initialize_database()

        if args.pdf:
            # Local file: the title is optional and passed as "" when unset.
            paper = LocalImporter(storage).import_pdf(
                pdf_path=args.pdf,
                title=args.title or "",
                notes=args.notes,
                tags=args.tags,
            )
            index.index_paper(paper)

            print(f"Successfully imported local PDF: {paper.paper_id}")
            print(f"Title: {paper.title}")

        elif args.arxiv:
            # arXiv entry: the importer receives the raw ID or URL.
            paper = ArxivImporter(storage).import_arxiv_paper(
                arxiv_input=args.arxiv,
                notes=args.notes,
                tags=args.tags,
            )
            index.index_paper(paper)

            print(f"Successfully imported arXiv paper: {paper.paper_id}")
            print(f"Title: {paper.title}")
            print(f"Authors: {', '.join(paper.authors)}")

    except Exception as exc:
        print(f"Error importing paper: {exc}")
        return 1

    return 0
|
||||
|
||||
|
||||
def _handle_convert(args: argparse.Namespace) -> int:
    """Convert one paper, or every pending paper, to Markdown.

    Args:
        args: Parsed CLI arguments providing ``library`` and the optional
            ``paper_id`` that restricts the conversion to a single paper.

    Returns:
        ``0`` when every attempted conversion succeeded; ``1`` when a
        conversion failed, the requested paper was not found, or an error
        was raised.
    """
    try:
        paths = LibraryPaths.from_root(_resolve_library_root(Path(args.library)))
        storage_manager = PaperStorageManager(paths)
        converter = MinerUConverter(storage_manager)

        if not args.paper_id:
            # No ID given: process everything that is still pending.
            success_count, failure_count = converter.convert_all_pending()
            print(f"Complete: {success_count} successful, {failure_count} failed")
            return 0 if failure_count == 0 else 1

        for metadata in storage_manager.list_all_papers():
            if metadata.paper_id != args.paper_id:
                continue
            if converter.convert_paper(metadata):
                print(f"Successfully converted paper: {metadata.paper_id}")
                return 0
            # A failed single-paper conversion must be visible in the exit
            # code, matching the convert-all branch above.
            print(f"Failed to convert paper: {metadata.paper_id}")
            return 1

        print(f"Paper not found: {args.paper_id}")
        return 1

    except Exception as e:
        # Top-level CLI boundary: report and signal failure via exit code.
        print(f"Error during conversion: {e}")
        return 1
|
||||
|
||||
|
||||
def _handle_reindex(args: argparse.Namespace) -> int:
    """Drop and rebuild the search index from the papers kept in storage.

    Args:
        args: Parsed CLI arguments providing ``library``.

    Returns:
        ``0`` when every paper was indexed without error, otherwise ``1``
        (also returned when an exception is raised).
    """
    try:
        library = LibraryPaths.from_root(_resolve_library_root(Path(args.library)))
        storage = PaperStorageManager(library)
        index = DatabaseManager(library)

        print("Rebuilding search index...")

        # Make sure the schema exists, then repopulate it from storage.
        index.initialize_database()
        indexed, failed = index.reindex_from_storage(storage)

        print(f"Reindex complete: {indexed} papers indexed, {failed} errors")

        # Summarise what the index contains after the rebuild.
        summary = index.get_statistics()
        print(f"Total papers: {summary['total_papers']}")
        if summary.get("by_source_type"):
            pairs = [
                f"{k}: {v}" for k, v in summary["by_source_type"].items()
            ]
            by_source = ", ".join(pairs)
            print(f"By source: {by_source}")

        return 1 if failed != 0 else 0

    except Exception as exc:
        print(f"Error during reindex: {exc}")
        return 1
|
||||
|
||||
|
||||
def main() -> None:
|
||||
"""Console script entrypoint."""
|
||||
parser = _build_parser()
|
||||
|
||||
@@ -5,7 +5,6 @@ from __future__ import annotations
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
DEFAULT_CONFIG_DIRNAME = "config"
|
||||
DEFAULT_DB_DIRNAME = "db"
|
||||
DEFAULT_CACHE_DIRNAME = "cache"
|
||||
@@ -29,7 +28,7 @@ class LibraryPaths:
|
||||
config_path: Path
|
||||
|
||||
@classmethod
|
||||
def from_root(cls, root: Path) -> "LibraryPaths":
|
||||
def from_root(cls, root: Path) -> LibraryPaths:
|
||||
"""Build a standard library layout from a root directory."""
|
||||
resolved_root = root.expanduser().resolve()
|
||||
config_dir = resolved_root / DEFAULT_CONFIG_DIRNAME
|
||||
|
||||
@@ -0,0 +1,5 @@
|
||||
"""PDF conversion functionality for paperlib."""
|
||||
|
||||
from .mineru_converter import MinerUConverter
|
||||
|
||||
__all__ = ["MinerUConverter"]
|
||||
@@ -0,0 +1,134 @@
|
||||
"""PDF to Markdown conversion using MinerU."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
from paperlib.models import ConversionStatus, PaperMetadata
|
||||
from paperlib.storage import PaperStorageManager
|
||||
|
||||
|
||||
class MinerUConverter:
    """Convert stored paper PDFs to Markdown by running MinerU in a subprocess."""

    def __init__(self, storage_manager: PaperStorageManager) -> None:
        """Keep the storage manager used to resolve paths and persist status."""
        self.storage_manager = storage_manager
        self.logger = logging.getLogger(__name__)

    def is_mineru_available(self) -> bool:
        """Return whether ``import mineru`` succeeds in the current interpreter.

        The check runs in a child process so that importing the package does
        not affect this process.
        """
        try:
            result = subprocess.run(
                [sys.executable, "-c", "import mineru"],
                capture_output=True,
                check=False,
            )
        except (OSError, subprocess.SubprocessError):
            return False
        return result.returncode == 0

    def convert_paper(self, metadata: PaperMetadata) -> bool:
        """Convert one paper's PDF to Markdown.

        The conversion status on ``metadata`` is set to ``PROCESSING`` before
        the run and to ``SUCCESS`` or ``FAILED`` afterwards; every change is
        persisted through the storage manager.

        Args:
            metadata: Metadata of the paper to convert.

        Returns:
            ``True`` when a Markdown file was produced and moved to the
            paper's Markdown path, ``False`` otherwise.
        """
        if not self.is_mineru_available():
            self.logger.error("MinerU is not available")
            return False

        if not metadata.pdf_path:
            # Without this guard the path join below raises TypeError for a
            # paper that has no PDF recorded.
            self.logger.error("No PDF path recorded for paper: %s", metadata.paper_id)
            return False

        paths = self.storage_manager.get_paper_paths(
            metadata.paper_id, metadata.source_type
        )
        pdf_path = self.storage_manager.library_paths.root / metadata.pdf_path
        markdown_path = paths["markdown"]
        logs_dir = paths["logs"]
        paper_dir = paths["directory"]

        if not pdf_path.exists():
            self.logger.error("PDF file not found: %s", pdf_path)
            return False

        self._set_status(metadata, ConversionStatus.PROCESSING)

        try:
            # The log directory may not exist yet for a freshly stored paper.
            logs_dir.mkdir(parents=True, exist_ok=True)
            log_file = logs_dir / "mineru.log"

            # NOTE(review): availability is checked with ``import mineru``
            # while the command runs ``magic_pdf.pipe.UNIPipe`` -- confirm
            # that this module and these options exist for the MinerU
            # version the project depends on.
            cmd = [
                sys.executable,
                "-m",
                "magic_pdf.pipe.UNIPipe",
                "--pdf",
                str(pdf_path),
                "--output-dir",
                str(paper_dir),
            ]
            self.logger.info("Running MinerU conversion: %s", " ".join(cmd))

            # stdout and stderr are both captured in the per-paper log file.
            with log_file.open("w") as log:
                result = subprocess.run(
                    cmd,
                    stdout=log,
                    stderr=subprocess.STDOUT,
                    cwd=paper_dir,
                    check=False,
                )

            if result.returncode != 0:
                self.logger.error(
                    "MinerU conversion failed with return code %s",
                    result.returncode,
                )
                self._set_status(metadata, ConversionStatus.FAILED)
                return False

            produced = self._find_markdown_output(paper_dir)
            if produced is None:
                self.logger.error("No markdown output found after conversion")
                self._set_status(metadata, ConversionStatus.FAILED)
                return False

            if produced != markdown_path:
                # ``replace`` overwrites an existing target on every
                # platform; ``rename`` raises on Windows in that case.
                produced.replace(markdown_path)

            self._set_status(metadata, ConversionStatus.SUCCESS)
            self.logger.info(
                "Successfully converted %s to %s", pdf_path, markdown_path
            )
            return True

        except Exception as e:
            self.logger.error("Exception during conversion: %s", e)
            self._set_status(metadata, ConversionStatus.FAILED)
            return False

    def convert_all_pending(self) -> tuple[int, int]:
        """Convert every stored paper whose conversion status is ``PENDING``.

        Returns:
            ``(success_count, failure_count)`` for the attempted conversions.
        """
        success_count = 0
        failure_count = 0

        for metadata in self.storage_manager.list_all_papers():
            if metadata.conversion_status != ConversionStatus.PENDING:
                continue
            if self.convert_paper(metadata):
                success_count += 1
            else:
                failure_count += 1

        return success_count, failure_count

    def _set_status(self, metadata: PaperMetadata, status: ConversionStatus) -> None:
        """Record ``status`` on ``metadata`` and persist it via the storage manager."""
        metadata.conversion_status = status
        self.storage_manager.update_paper_metadata(metadata)

    @staticmethod
    def _find_markdown_output(directory):
        """Return the Markdown file to use from ``directory``, or ``None``.

        Files directly inside ``directory`` take precedence over files in
        sub-directories. Candidates are sorted so the choice does not depend
        on the file system's listing order.
        """
        for candidates in (directory.glob("*.md"), directory.rglob("*.md")):
            ordered = sorted(candidates)
            if ordered:
                return ordered[0]
        return None
|
||||
@@ -0,0 +1,6 @@
|
||||
"""Import functionality for paperlib."""
|
||||
|
||||
from .arxiv_importer import ArxivImporter
|
||||
from .local_importer import LocalImporter
|
||||
|
||||
__all__ = ["ArxivImporter", "LocalImporter"]
|
||||
@@ -0,0 +1,112 @@
|
||||
"""arXiv import functionality."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import arxiv
|
||||
|
||||
from paperlib.models import PaperMetadata, SourceType
|
||||
from paperlib.storage import PaperStorageManager
|
||||
|
||||
|
||||
class ArxivImporter:
    """Import papers from arXiv into the library storage."""

    # Identifier patterns, tried in order against the user input.
    #   new style: YYMM.NNNN or YYMM.NNNNN, optional version (2301.12345v2)
    #   old style: archive[.SC]/YYMMNNN, optional version (math.GT/0309136v1)
    _ID_PATTERNS = (
        re.compile(r"(\d{4}\.\d{4,5}(?:v\d+)?)", re.IGNORECASE),
        re.compile(r"([a-z-]+(?:\.[a-z]{2})?/\d{7}(?:v\d+)?)", re.IGNORECASE),
    )

    def __init__(self, storage_manager: PaperStorageManager) -> None:
        """Keep the storage manager and create the arXiv API client."""
        self.storage_manager = storage_manager
        self.client = arxiv.Client(page_size=10, delay_seconds=3.0, num_retries=3)

    def extract_arxiv_id(self, input_string: str) -> str:
        """Extract an arXiv identifier from an ID, ``arXiv:`` reference or URL.

        A version suffix (``vN``) is kept when present, for both identifier
        styles. Input that matches neither style is returned stripped but
        otherwise unchanged.
        """
        cleaned = input_string.strip()
        for pattern in self._ID_PATTERNS:
            match = pattern.search(cleaned)
            if match:
                return match.group(1)
        return cleaned

    def fetch_paper_metadata(self, arxiv_id: str) -> arxiv.Result:
        """Fetch the arXiv record for ``arxiv_id``.

        Raises:
            ValueError: If the API returns no result for the identifier.
        """
        search = arxiv.Search(id_list=[arxiv_id])
        results = list(self.client.results(search))
        if not results:
            msg = f"Paper not found on arXiv: {arxiv_id}"
            raise ValueError(msg)
        return results[0]

    def download_pdf(self, result: arxiv.Result) -> Path:
        """Download the paper's PDF into a temporary file and return its path.

        The caller owns the returned file and must delete it. If the
        download raises, the temporary file is removed before the exception
        propagates.
        """
        # Only reserve a unique name here; the handle is closed before the
        # download writes to that path.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            tmp_path = Path(tmp_file.name)

        try:
            result.download_pdf(filename=str(tmp_path))
        except BaseException:
            tmp_path.unlink(missing_ok=True)
            raise

        return tmp_path

    def import_arxiv_paper(
        self, arxiv_input: str, notes: str = "", tags: list[str] | None = None
    ) -> PaperMetadata:
        """Import a paper from arXiv.

        Args:
            arxiv_input: arXiv ID or URL identifying the paper.
            notes: Free-form notes stored with the paper.
            tags: Tags stored with the paper; ``None`` means no tags.

        Returns:
            The metadata returned by the storage manager for the stored paper.

        Raises:
            ValueError: If the paper is already imported or is not found on
                arXiv.
        """
        arxiv_id = self.extract_arxiv_id(arxiv_input)

        # Refuse duplicates before any network request is made.
        paper_id = self.storage_manager.generate_paper_id(SourceType.ARXIV, arxiv_id)
        if self.storage_manager.paper_exists(paper_id, SourceType.ARXIV):
            msg = f"Paper already imported: {arxiv_id}"
            raise ValueError(msg)

        result = self.fetch_paper_metadata(arxiv_id)
        pdf_path = self.download_pdf(result)

        try:
            # Timestamps are stored without timezone information.
            published_date = (
                result.published.replace(tzinfo=None) if result.published else None
            )
            updated_date = (
                result.updated.replace(tzinfo=None) if result.updated else None
            )

            return self.storage_manager.store_paper(
                pdf_path=pdf_path,
                source_type=SourceType.ARXIV,
                source_id=arxiv_id,
                title=result.title,
                authors=[author.name for author in result.authors],
                published_date=published_date,
                updated_date=updated_date,
                categories=list(result.categories),
                notes=notes,
                tags=tags or [],
            )
        finally:
            # The temporary download is removed whether or not storing worked.
            pdf_path.unlink(missing_ok=True)
|
||||
@@ -0,0 +1,56 @@
|
||||
"""Local PDF import functionality."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from paperlib.models import PaperMetadata, SourceType
|
||||
from paperlib.storage import PaperStorageManager
|
||||
|
||||
|
||||
class LocalImporter:
    """Import PDF files from the local file system into the library storage."""

    def __init__(self, storage_manager: PaperStorageManager) -> None:
        """Keep the storage manager used to store imported papers."""
        self.storage_manager = storage_manager

    def import_pdf(
        self,
        pdf_path: Path,
        title: str = "",
        notes: str = "",
        tags: list[str] | None = None,
    ) -> PaperMetadata:
        """Import a local PDF file.

        Args:
            pdf_path: Path of the PDF to import.
            title: Title to store; derived from the file name when empty.
            notes: Free-form notes stored with the paper.
            tags: Tags stored with the paper; ``None`` means no tags.

        Returns:
            The metadata returned by the storage manager for the stored paper.

        Raises:
            FileNotFoundError: If ``pdf_path`` is not an existing regular file.
            ValueError: If the file does not have a ``.pdf`` suffix or the
                paper has already been imported.
        """
        # ``is_file`` also rejects directories, which ``exists`` would accept
        # (for example a directory that happens to be named ``papers.pdf``).
        if not pdf_path.is_file():
            msg = f"PDF file not found: {pdf_path}"
            raise FileNotFoundError(msg)

        if pdf_path.suffix.lower() != ".pdf":
            msg = f"File is not a PDF: {pdf_path}"
            raise ValueError(msg)

        # Refuse duplicates before anything is stored.
        paper_id = self.storage_manager.generate_paper_id(
            SourceType.LOCAL, pdf_path=pdf_path
        )
        if self.storage_manager.paper_exists(paper_id, SourceType.LOCAL):
            msg = f"Paper already imported: {paper_id}"
            raise ValueError(msg)

        if not title:
            # Fall back to a title built from the file name:
            # "my_paper-v2.pdf" becomes "My Paper V2".
            title = pdf_path.stem.replace("_", " ").replace("-", " ").title()

        return self.storage_manager.store_paper(
            pdf_path=pdf_path,
            source_type=SourceType.LOCAL,
            source_id=None,
            title=title,
            notes=notes,
            tags=tags or [],
        )
|
||||
@@ -0,0 +1,5 @@
|
||||
"""SQLite index layer for paperlib."""
|
||||
|
||||
from .database import DatabaseManager
|
||||
|
||||
__all__ = ["DatabaseManager"]
|
||||
@@ -0,0 +1,321 @@
|
||||
"""SQLite database manager for indexing papers."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sqlite3
|
||||
from collections.abc import Iterator
|
||||
|
||||
from paperlib.config import LibraryPaths
|
||||
from paperlib.models import ConversionStatus, PaperMetadata, SourceType, SummaryStatus
|
||||
|
||||
|
||||
class DatabaseManager:
|
||||
"""Manages SQLite database for indexing papers."""
|
||||
|
||||
def __init__(self, library_paths: LibraryPaths) -> None:
|
||||
self.library_paths = library_paths
|
||||
self.db_path = library_paths.db_path
|
||||
|
||||
def _get_connection(self) -> sqlite3.Connection:
|
||||
"""Get a database connection with proper settings."""
|
||||
# Ensure database directory exists
|
||||
self.db_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
conn = sqlite3.connect(self.db_path)
|
||||
conn.row_factory = sqlite3.Row # Enable dict-like access to rows
|
||||
conn.execute("PRAGMA foreign_keys = ON") # Enable foreign keys
|
||||
return conn
|
||||
|
||||
def initialize_database(self) -> None:
|
||||
"""Initialize the database schema."""
|
||||
with self._get_connection() as conn:
|
||||
# Main papers table
|
||||
conn.execute("""
|
||||
CREATE TABLE IF NOT EXISTS papers (
|
||||
paper_id TEXT PRIMARY KEY,
|
||||
source_type TEXT NOT NULL,
|
||||
source_id TEXT,
|
||||
title TEXT NOT NULL,
|
||||
authors_json TEXT NOT NULL, -- JSON array of authors
|
||||
published_date TEXT, -- ISO format
|
||||
updated_date TEXT, -- ISO format
|
||||
categories_json TEXT NOT NULL, -- JSON array of categories
|
||||
pdf_path TEXT,
|
||||
paper_md_path TEXT,
|
||||
summary_json_path TEXT,
|
||||
summary_md_path TEXT,
|
||||
imported_at TEXT NOT NULL, -- ISO format
|
||||
conversion_status TEXT NOT NULL,
|
||||
summary_status TEXT NOT NULL,
|
||||
tags_json TEXT NOT NULL, -- JSON array of tags
|
||||
notes TEXT NOT NULL,
|
||||
|
||||
-- Computed fields for search
|
||||
search_text TEXT, -- Full-text search content
|
||||
author_list TEXT, -- Space-separated authors for search
|
||||
category_list TEXT -- Space-separated categories for search
|
||||
)
|
||||
""")
|
||||
|
||||
# Create indexes for common queries
|
||||
conn.execute(
|
||||
"CREATE INDEX IF NOT EXISTS idx_papers_source_type ON papers(source_type)"
|
||||
)
|
||||
conn.execute(
|
||||
"CREATE INDEX IF NOT EXISTS idx_papers_source_id ON papers(source_id)"
|
||||
)
|
||||
conn.execute(
|
||||
"CREATE INDEX IF NOT EXISTS idx_papers_conversion_status "
|
||||
"ON papers(conversion_status)"
|
||||
)
|
||||
conn.execute(
|
||||
"CREATE INDEX IF NOT EXISTS idx_papers_summary_status "
|
||||
"ON papers(summary_status)"
|
||||
)
|
||||
conn.execute(
|
||||
"CREATE INDEX IF NOT EXISTS idx_papers_imported_at ON papers(imported_at)"
|
||||
)
|
||||
|
||||
# Full-text search virtual table
|
||||
conn.execute("""
|
||||
CREATE VIRTUAL TABLE IF NOT EXISTS papers_fts USING fts5(
|
||||
paper_id UNINDEXED,
|
||||
title,
|
||||
authors,
|
||||
search_text,
|
||||
categories,
|
||||
tags,
|
||||
notes,
|
||||
content='papers',
|
||||
content_rowid='rowid'
|
||||
)
|
||||
""")
|
||||
|
||||
def index_paper(self, metadata: PaperMetadata) -> None:
|
||||
"""Index a paper in the database."""
|
||||
import json
|
||||
|
||||
with self._get_connection() as conn:
|
||||
# Prepare data for insertion
|
||||
parts = [
|
||||
metadata.title,
|
||||
" ".join(metadata.authors),
|
||||
" ".join(metadata.categories),
|
||||
" ".join(metadata.tags),
|
||||
metadata.notes,
|
||||
]
|
||||
search_text = " ".join(parts)
|
||||
author_list = " ".join(metadata.authors)
|
||||
category_list = " ".join(metadata.categories)
|
||||
|
||||
# Insert or replace in main table
|
||||
conn.execute(
|
||||
"""
|
||||
INSERT OR REPLACE INTO papers (
|
||||
paper_id, source_type, source_id, title, authors_json,
|
||||
published_date, updated_date, categories_json, pdf_path,
|
||||
paper_md_path, summary_json_path, summary_md_path,
|
||||
imported_at, conversion_status, summary_status,
|
||||
tags_json, notes, search_text, author_list, category_list
|
||||
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
""",
|
||||
(
|
||||
metadata.paper_id,
|
||||
metadata.source_type.value,
|
||||
metadata.source_id,
|
||||
metadata.title,
|
||||
json.dumps(metadata.authors),
|
||||
metadata.published_date.isoformat()
|
||||
if metadata.published_date
|
||||
else None,
|
||||
metadata.updated_date.isoformat()
|
||||
if metadata.updated_date
|
||||
else None,
|
||||
json.dumps(metadata.categories),
|
||||
metadata.pdf_path,
|
||||
metadata.paper_md_path,
|
||||
metadata.summary_json_path,
|
||||
metadata.summary_md_path,
|
||||
metadata.imported_at.isoformat(),
|
||||
metadata.conversion_status.value,
|
||||
metadata.summary_status.value,
|
||||
json.dumps(metadata.tags),
|
||||
metadata.notes,
|
||||
search_text,
|
||||
author_list,
|
||||
category_list,
|
||||
),
|
||||
)
|
||||
|
||||
# Update FTS table
|
||||
conn.execute(
|
||||
"""
|
||||
INSERT OR REPLACE INTO papers_fts (
|
||||
paper_id, title, authors, search_text, categories, tags, notes
|
||||
) VALUES (?, ?, ?, ?, ?, ?, ?)
|
||||
""",
|
||||
(
|
||||
metadata.paper_id,
|
||||
metadata.title,
|
||||
" ".join(metadata.authors),
|
||||
search_text,
|
||||
" ".join(metadata.categories),
|
||||
" ".join(metadata.tags),
|
||||
metadata.notes,
|
||||
),
|
||||
)
|
||||
|
||||
def remove_paper(self, paper_id: str) -> bool:
|
||||
"""Remove a paper from the index."""
|
||||
with self._get_connection() as conn:
|
||||
cursor = conn.execute("DELETE FROM papers WHERE paper_id = ?", (paper_id,))
|
||||
conn.execute("DELETE FROM papers_fts WHERE paper_id = ?", (paper_id,))
|
||||
return cursor.rowcount > 0
|
||||
|
||||
def get_paper(self, paper_id: str) -> dict | None:
|
||||
"""Get a paper by ID from the index."""
|
||||
with self._get_connection() as conn:
|
||||
cursor = conn.execute(
|
||||
"SELECT * FROM papers WHERE paper_id = ?", (paper_id,)
|
||||
)
|
||||
row = cursor.fetchone()
|
||||
return dict(row) if row else None
|
||||
|
||||
def list_papers(
|
||||
self,
|
||||
source_type: SourceType | None = None,
|
||||
conversion_status: ConversionStatus | None = None,
|
||||
summary_status: SummaryStatus | None = None,
|
||||
limit: int | None = None,
|
||||
offset: int = 0,
|
||||
) -> Iterator[dict]:
|
||||
"""List papers with optional filtering."""
|
||||
conditions = []
|
||||
params = []
|
||||
|
||||
if source_type:
|
||||
conditions.append("source_type = ?")
|
||||
params.append(source_type.value)
|
||||
|
||||
if conversion_status:
|
||||
conditions.append("conversion_status = ?")
|
||||
params.append(conversion_status.value)
|
||||
|
||||
if summary_status:
|
||||
conditions.append("summary_status = ?")
|
||||
params.append(summary_status.value)
|
||||
|
||||
where_clause = ""
|
||||
if conditions:
|
||||
where_clause = "WHERE " + " AND ".join(conditions)
|
||||
|
||||
query = f"SELECT * FROM papers {where_clause} ORDER BY imported_at DESC"
|
||||
|
||||
if limit:
|
||||
query += " LIMIT ? OFFSET ?"
|
||||
params.extend([limit, offset])
|
||||
|
||||
with self._get_connection() as conn:
|
||||
cursor = conn.execute(query, params)
|
||||
for row in cursor:
|
||||
yield dict(row)
|
||||
|
||||
def search_papers(self, query: str, limit: int = 50) -> Iterator[dict]:
|
||||
"""Search papers using full-text search."""
|
||||
with self._get_connection() as conn:
|
||||
# Use FTS for full-text search
|
||||
cursor = conn.execute(
|
||||
"""
|
||||
SELECT papers.* FROM papers_fts
|
||||
JOIN papers ON papers.paper_id = papers_fts.paper_id
|
||||
WHERE papers_fts MATCH ?
|
||||
ORDER BY rank
|
||||
LIMIT ?
|
||||
""",
|
||||
(query, limit),
|
||||
)
|
||||
|
||||
for row in cursor:
|
||||
yield dict(row)
|
||||
|
||||
def search_by_field(
|
||||
self,
|
||||
field: str,
|
||||
value: str,
|
||||
exact_match: bool = False,
|
||||
limit: int = 50,
|
||||
) -> Iterator[dict]:
|
||||
"""Search papers by specific field."""
|
||||
if field not in ["title", "author_list", "category_list", "notes"]:
|
||||
msg = f"Invalid field for search: {field}"
|
||||
raise ValueError(msg)
|
||||
|
||||
if exact_match:
|
||||
where_clause = f"{field} = ?"
|
||||
params = [value]
|
||||
else:
|
||||
where_clause = f"{field} LIKE ?"
|
||||
params = [f"%{value}%"]
|
||||
|
||||
query = f"SELECT * FROM papers WHERE {where_clause} ORDER BY imported_at DESC LIMIT ?"
|
||||
params.append(limit)
|
||||
|
||||
with self._get_connection() as conn:
|
||||
cursor = conn.execute(query, params)
|
||||
for row in cursor:
|
||||
yield dict(row)
|
||||
|
||||
def get_statistics(self) -> dict:
    """Collect aggregate counts describing the ``papers`` table.

    Returns:
        A dict with ``total_papers`` (row count) plus three breakdowns --
        ``by_source_type``, ``by_conversion_status`` and
        ``by_summary_status`` -- each mapping a column value to the number of
        rows carrying it.
    """
    # (result key, grouped column, query) for every per-column breakdown.
    grouped_queries = (
        (
            "by_source_type",
            "source_type",
            "SELECT source_type, COUNT(*) as count FROM papers GROUP BY source_type",
        ),
        (
            "by_conversion_status",
            "conversion_status",
            "SELECT conversion_status, COUNT(*) as count FROM papers GROUP BY conversion_status",
        ),
        (
            "by_summary_status",
            "summary_status",
            "SELECT summary_status, COUNT(*) as count FROM papers GROUP BY summary_status",
        ),
    )

    with self._get_connection() as conn:
        total_row = conn.execute("SELECT COUNT(*) as count FROM papers").fetchone()
        stats = {"total_papers": total_row["count"]}

        for stat_key, column, sql in grouped_queries:
            breakdown = {}
            for row in conn.execute(sql):
                breakdown[row[column]] = row["count"]
            stats[stat_key] = breakdown

        return stats
|
||||
|
||||
def reindex_from_storage(self, storage_manager) -> tuple[int, int]:
    """Rebuild the index from storage files.

    Both ``papers`` and ``papers_fts`` are emptied first, then every metadata
    object produced by ``storage_manager.list_all_papers()`` is passed to
    ``self.index_paper``. Indexing is best effort: a paper that fails is
    logged and counted, and the rebuild carries on with the next one.

    Args:
        storage_manager: Object exposing ``list_all_papers()``.

    Returns:
        ``(success_count, error_count)``.
    """
    import logging  # function-scope import keeps this method self-contained

    logger = logging.getLogger(__name__)
    success_count = 0
    error_count = 0

    # Clear existing index
    with self._get_connection() as conn:
        conn.execute("DELETE FROM papers")
        conn.execute("DELETE FROM papers_fts")

    # Reindex all papers from storage
    for metadata in storage_manager.list_all_papers():
        try:
            self.index_paper(metadata)
        except Exception:
            # Deliberately broad: one bad paper must not abort the rebuild,
            # but the failure is recorded so it can be diagnosed afterwards.
            error_count += 1
            logger.exception(
                "Failed to index paper %s",
                getattr(metadata, "paper_id", "<unknown>"),
            )
        else:
            success_count += 1

    return success_count, error_count
|
||||
@@ -0,0 +1,17 @@
|
||||
"""Data models for paperlib."""
|
||||
|
||||
from .paper import (
|
||||
ConversionStatus,
|
||||
PaperMetadata,
|
||||
PaperSummary,
|
||||
SourceType,
|
||||
SummaryStatus,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"ConversionStatus",
|
||||
"PaperMetadata",
|
||||
"PaperSummary",
|
||||
"SourceType",
|
||||
"SummaryStatus",
|
||||
]
|
||||
@@ -0,0 +1,164 @@
|
||||
"""Data models for paper metadata and summaries."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from dataclasses import asdict, dataclass, field
|
||||
from datetime import datetime
|
||||
from enum import StrEnum
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
class ConversionStatus(StrEnum):
|
||||
"""Status of PDF to Markdown conversion."""
|
||||
|
||||
PENDING = "pending"
|
||||
PROCESSING = "processing"
|
||||
SUCCESS = "success"
|
||||
FAILED = "failed"
|
||||
|
||||
|
||||
class SummaryStatus(StrEnum):
|
||||
"""Status of AI summarization."""
|
||||
|
||||
PENDING = "pending"
|
||||
PROCESSING = "processing"
|
||||
SUCCESS = "success"
|
||||
FAILED = "failed"
|
||||
NOT_REQUESTED = "not_requested"
|
||||
|
||||
|
||||
class SourceType(StrEnum):
|
||||
"""Type of paper source."""
|
||||
|
||||
LOCAL = "local"
|
||||
ARXIV = "arxiv"
|
||||
|
||||
|
||||
@dataclass
class PaperMetadata:
    """Metadata for a paper (stored in meta.json).

    Instances round-trip through JSON via ``to_dict`` / ``from_dict``:
    datetimes are written as ISO-8601 strings and the enum fields as their
    string values.
    """

    # Datetime-valued fields that are (de)serialized as ISO-8601 strings.
    # Deliberately unannotated so the dataclass does not treat it as a field.
    _DATETIME_FIELDS = ("published_date", "updated_date", "imported_at")

    # Core identifiers
    paper_id: str
    source_type: SourceType
    source_id: str | None = None  # arXiv ID or local file hash

    # Bibliographic information
    title: str = ""
    authors: list[str] = field(default_factory=list)
    published_date: datetime | None = None
    updated_date: datetime | None = None
    categories: list[str] = field(default_factory=list)

    # File paths (relative to library root)
    pdf_path: str | None = None
    paper_md_path: str | None = None
    summary_json_path: str | None = None
    summary_md_path: str | None = None

    # Processing status
    imported_at: datetime = field(default_factory=datetime.now)
    conversion_status: ConversionStatus = ConversionStatus.PENDING
    summary_status: SummaryStatus = SummaryStatus.NOT_REQUESTED

    # Additional metadata
    tags: list[str] = field(default_factory=list)
    notes: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Convert to a JSON-serializable dictionary.

        Returns:
            A new dict of all fields, with datetimes rendered as ISO-8601
            strings and enum fields rendered as their string values.
        """
        data = asdict(self)
        # Convert datetime objects to ISO format strings
        for field_name in self._DATETIME_FIELDS:
            if data[field_name] is not None:
                data[field_name] = data[field_name].isoformat()
        # Convert enums to strings
        data["source_type"] = self.source_type.value
        data["conversion_status"] = self.conversion_status.value
        data["summary_status"] = self.summary_status.value
        return data

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> PaperMetadata:
        """Create an instance from a dictionary (JSON deserialization).

        The input mapping is not modified; conversions are applied to a
        shallow copy.

        Args:
            data: Mapping of field names to values, as produced by
                ``to_dict``.

        Raises:
            ValueError: If a date string is not valid ISO format or an enum
                value is unknown.
            TypeError: If required fields are missing or unknown keys are
                present.
        """
        payload = dict(data)  # never mutate the caller's dict
        # Convert ISO format strings back to datetime objects; values that
        # are already datetimes (or empty) are passed through untouched.
        for field_name in cls._DATETIME_FIELDS:
            raw = payload.get(field_name)
            if raw and isinstance(raw, str):
                payload[field_name] = datetime.fromisoformat(raw)
        # Convert strings back to enums
        if "source_type" in payload:
            payload["source_type"] = SourceType(payload["source_type"])
        if "conversion_status" in payload:
            payload["conversion_status"] = ConversionStatus(
                payload["conversion_status"]
            )
        if "summary_status" in payload:
            payload["summary_status"] = SummaryStatus(payload["summary_status"])
        return cls(**payload)

    def save_to_file(self, file_path: Path) -> None:
        """Save metadata to a JSON file atomically.

        The JSON is written to a sibling ``.tmp`` file which then replaces
        ``file_path``; if anything fails the temporary file is removed and
        any existing ``file_path`` is left as it was.

        Args:
            file_path: Destination of the JSON document.
        """
        temp_path = file_path.with_suffix(".tmp")
        try:
            with temp_path.open("w", encoding="utf-8") as f:
                json.dump(self.to_dict(), f, indent=2)
            # replace() overwrites an existing target on every platform,
            # whereas rename() raises on Windows when the target exists.
            temp_path.replace(file_path)
        except BaseException:
            temp_path.unlink(missing_ok=True)
            raise

    @classmethod
    def load_from_file(cls, file_path: Path) -> PaperMetadata:
        """Load metadata from a JSON file.

        Args:
            file_path: Path of a JSON document written by ``save_to_file``.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If the file is not valid JSON or holds invalid values.
        """
        with file_path.open(encoding="utf-8") as f:
            data = json.load(f)
        return cls.from_dict(data)
|
||||
|
||||
|
||||
@dataclass
class PaperSummary:
    """Structured summary for a paper (stored in summary.json).

    Every field has a default, so an empty summary can be created and filled
    in incrementally.
    """

    # Schema version for migration
    schema_version: str = "1.0"

    # Core summary fields
    one_sentence_summary: str = ""
    problem_statement: str = ""
    method_overview: str = ""
    main_results: str = ""
    claimed_contributions: list[str] = field(default_factory=list)
    assumptions: list[str] = field(default_factory=list)
    limitations: list[str] = field(default_factory=list)

    # Categorization
    problem_tags: list[str] = field(default_factory=list)
    technique_tags: list[str] = field(default_factory=list)

    # Entities mentioned
    entities: list[str] = field(default_factory=list)

    # Relevance scoring (optional)
    relevance_to_user: float | None = None
    recommended_sections: list[str] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Convert to a dictionary for JSON serialization."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> PaperSummary:
        """Create an instance from a dictionary (JSON deserialization).

        Args:
            data: Mapping of field names to values, as produced by
                ``to_dict``.

        Raises:
            TypeError: If ``data`` contains keys that are not fields.
        """
        return cls(**data)

    def save_to_file(self, file_path: Path) -> None:
        """Save the summary to a JSON file atomically.

        The JSON is written to a sibling ``.tmp`` file which then replaces
        ``file_path``; if anything fails the temporary file is removed and
        any existing ``file_path`` is left as it was.

        Args:
            file_path: Destination of the JSON document.
        """
        temp_path = file_path.with_suffix(".tmp")
        try:
            with temp_path.open("w", encoding="utf-8") as f:
                json.dump(self.to_dict(), f, indent=2)
            # replace() overwrites an existing target on every platform,
            # whereas rename() raises on Windows when the target exists.
            temp_path.replace(file_path)
        except BaseException:
            temp_path.unlink(missing_ok=True)
            raise

    @classmethod
    def load_from_file(cls, file_path: Path) -> PaperSummary:
        """Load a summary from a JSON file.

        Args:
            file_path: Path of a JSON document written by ``save_to_file``.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If the file is not valid JSON.
        """
        with file_path.open(encoding="utf-8") as f:
            data = json.load(f)
        return cls.from_dict(data)
|
||||
@@ -0,0 +1,5 @@
|
||||
"""Storage layer for paperlib."""
|
||||
|
||||
from .manager import PaperStorageManager
|
||||
|
||||
__all__ = ["PaperStorageManager"]
|
||||
@@ -0,0 +1,183 @@
|
||||
"""Paper storage manager for CRUD operations on metadata files."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import shutil
|
||||
from collections.abc import Iterator
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from paperlib.config import LibraryPaths
|
||||
from paperlib.models import PaperMetadata, PaperSummary, SourceType
|
||||
|
||||
|
||||
class PaperStorageManager:
    """Manages storage and retrieval of papers and their metadata.

    Each paper lives in its own directory under ``library_paths.papers_dir``.
    The directory is derived purely from the paper ID and source type, so the
    same ID always resolves to the same location.
    """

    # Errors that mean "this JSON file is missing or unusable": absent file,
    # invalid JSON or field values (ValueError), and structurally wrong
    # documents such as missing/unknown keys or a non-object top level
    # (TypeError / AttributeError).
    _LOAD_ERRORS = (FileNotFoundError, ValueError, TypeError, AttributeError)

    def __init__(self, library_paths: LibraryPaths) -> None:
        self.library_paths = library_paths

    def generate_paper_id(
        self,
        source_type: SourceType,
        source_id: str | None = None,
        pdf_path: Path | None = None,
    ) -> str:
        """Generate a stable paper ID based on source type and content.

        Args:
            source_type: Origin of the paper.
            source_id: arXiv identifier; required for arXiv papers.
            pdf_path: PDF whose content is hashed; required for local papers.

        Returns:
            ``arxiv-<normalized id>`` or ``local-<first 16 hex chars of the
            PDF's SHA-256>``.

        Raises:
            ValueError: If the information needed for ``source_type`` is
                missing.
        """
        if source_type == SourceType.ARXIV and source_id:
            # Only a trailing "v<digits>" is a version marker; other "v"
            # characters (e.g. in old-style archive names) are left alone.
            base, sep, version = source_id.rpartition("v")
            if sep and base and version.isdigit():
                normalized = f"{base}_v{version}"
            else:
                normalized = source_id
            # "/" (old-style IDs) must not leak into the ID as a path separator.
            normalized = normalized.replace(".", "_").replace("/", "_")
            return f"arxiv-{normalized}"

        if source_type == SourceType.LOCAL and pdf_path:
            # Stream the file through the hash instead of loading it whole.
            with pdf_path.open("rb") as f:
                hash_hex = hashlib.file_digest(f, "sha256").hexdigest()
            return f"local-{hash_hex[:16]}"  # Use first 16 chars of hash

        msg = "Cannot generate paper ID without proper source information"
        raise ValueError(msg)

    @staticmethod
    def _arxiv_year(paper_id: str) -> str:
        """Derive the four-digit submission year encoded in an arXiv paper ID."""
        body = paper_id.removeprefix("arxiv-")

        # Modern format YYMM.NNNNN (in use since 2007), e.g. "2212_06340" -> "2022".
        yymm = body[:4]
        if len(yymm) == 4 and yymm.isascii() and yymm.isdigit():
            return f"20{yymm[:2]}"

        # Old format archive/YYMMNNN (1991-2007), e.g. "hep-th_9901001" -> "1999".
        for token in body.replace("/", "_").split("_"):
            if len(token) == 7 and token.isascii() and token.isdigit():
                yy = token[:2]
                return f"19{yy}" if yy >= "91" else f"20{yy}"

        # Fixed bucket: the result must depend only on the ID, never on the
        # clock, or a stored paper could not be located again later.
        return "unknown"

    def get_paper_directory(self, paper_id: str, source_type: SourceType) -> Path:
        """Get the directory path for storing a paper's files.

        arXiv papers go under ``papers/arxiv/<year>/<paper_id>``; everything
        else goes under ``papers/local/<hash prefix>``.
        """
        papers_dir = self.library_paths.papers_dir
        if source_type == SourceType.ARXIV:
            return papers_dir / "arxiv" / self._arxiv_year(paper_id) / paper_id

        # Local papers go under papers/local/{hash-prefix}/
        return papers_dir / "local" / paper_id.removeprefix("local-")

    def get_paper_paths(
        self, paper_id: str, source_type: SourceType
    ) -> dict[str, Path]:
        """Get all expected file paths for a paper.

        Returns:
            Mapping with the keys ``directory``, ``meta``, ``pdf``,
            ``markdown``, ``summary_json``, ``summary_md``, ``assets`` and
            ``logs``. The paths are computed, not checked for existence.
        """
        paper_dir = self.get_paper_directory(paper_id, source_type)
        return {
            "directory": paper_dir,
            "meta": paper_dir / "meta.json",
            "pdf": paper_dir / "source.pdf",
            "markdown": paper_dir / "paper.md",
            "summary_json": paper_dir / "summary.json",
            "summary_md": paper_dir / "summary.md",
            "assets": paper_dir / "assets",
            "logs": paper_dir / "logs",
        }

    def store_paper(
        self,
        pdf_path: Path,
        source_type: SourceType,
        source_id: str | None = None,
        **metadata_kwargs,
    ) -> PaperMetadata:
        """Store a paper and create its metadata.

        Creates the paper directory (with ``assets`` and ``logs``), copies
        the PDF into it and writes ``meta.json``.

        Args:
            pdf_path: PDF file to copy into the library.
            source_type: Origin of the paper.
            source_id: arXiv identifier, when applicable.
            **metadata_kwargs: Extra ``PaperMetadata`` fields (title,
                authors, ...).

        Returns:
            The metadata object that was written to disk.
        """
        paper_id = self.generate_paper_id(source_type, source_id, pdf_path)
        paths = self.get_paper_paths(paper_id, source_type)

        # Create directory structure
        paths["directory"].mkdir(parents=True, exist_ok=True)
        paths["assets"].mkdir(exist_ok=True)
        paths["logs"].mkdir(exist_ok=True)

        # Copy PDF to storage
        shutil.copy2(pdf_path, paths["pdf"])

        # Paths are recorded relative to the library root.
        root = self.library_paths.root
        metadata = PaperMetadata(
            paper_id=paper_id,
            source_type=source_type,
            source_id=source_id,
            pdf_path=str(paths["pdf"].relative_to(root)),
            paper_md_path=str(paths["markdown"].relative_to(root)),
            summary_json_path=str(paths["summary_json"].relative_to(root)),
            summary_md_path=str(paths["summary_md"].relative_to(root)),
            **metadata_kwargs,
        )
        metadata.save_to_file(paths["meta"])
        return metadata

    def load_paper_metadata(
        self, paper_id: str, source_type: SourceType
    ) -> PaperMetadata | None:
        """Load paper metadata from storage.

        Returns:
            The metadata, or ``None`` when ``meta.json`` is missing or cannot
            be parsed into a ``PaperMetadata``.
        """
        meta_path = self.get_paper_paths(paper_id, source_type)["meta"]
        try:
            return PaperMetadata.load_from_file(meta_path)
        except self._LOAD_ERRORS:
            return None

    def update_paper_metadata(self, metadata: PaperMetadata) -> None:
        """Write ``metadata`` to the ``meta.json`` location derived from its ID."""
        paths = self.get_paper_paths(metadata.paper_id, metadata.source_type)
        metadata.save_to_file(paths["meta"])

    def load_paper_summary(
        self, paper_id: str, source_type: SourceType
    ) -> PaperSummary | None:
        """Load paper summary from storage.

        Returns:
            The summary, or ``None`` when ``summary.json`` is missing or
            cannot be parsed into a ``PaperSummary``.
        """
        summary_path = self.get_paper_paths(paper_id, source_type)["summary_json"]
        try:
            return PaperSummary.load_from_file(summary_path)
        except self._LOAD_ERRORS:
            return None

    def save_paper_summary(
        self, paper_id: str, source_type: SourceType, summary: PaperSummary
    ) -> None:
        """Save paper summary to the paper's ``summary.json``."""
        paths = self.get_paper_paths(paper_id, source_type)
        summary.save_to_file(paths["summary_json"])

    def list_all_papers(self) -> Iterator[PaperMetadata]:
        """Iterate over all papers in the library.

        Yields:
            One ``PaperMetadata`` per readable ``meta.json`` found anywhere
            below the papers directory; unreadable files are skipped.
        """
        papers_dir = self.library_paths.papers_dir
        if not papers_dir.exists():
            return

        for meta_file in papers_dir.rglob("meta.json"):
            try:
                yield PaperMetadata.load_from_file(meta_file)
            except self._LOAD_ERRORS:
                # Skip corrupted metadata files so one bad entry does not
                # abort the whole listing.
                continue

    def paper_exists(self, paper_id: str, source_type: SourceType) -> bool:
        """Check if a paper's ``meta.json`` already exists in storage."""
        return self.get_paper_paths(paper_id, source_type)["meta"].exists()

    def delete_paper(self, paper_id: str, source_type: SourceType) -> bool:
        """Delete a paper and all its files.

        Returns:
            ``True`` if the paper directory existed and was removed,
            ``False`` if there was nothing to delete.
        """
        paper_dir = self.get_paper_paths(paper_id, source_type)["directory"]
        if not paper_dir.exists():
            return False

        # Remove entire paper directory
        shutil.rmtree(paper_dir)
        return True
|
||||
@@ -97,6 +97,19 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/da/42/e921fccf5015463e32a3cf6ee7f980a6ed0f395ceeaa45060b61d86486c2/anyio-4.13.0-py3-none-any.whl", hash = "sha256:08b310f9e24a9594186fd75b4f73f4a4152069e3853f1ed8bfbf58369f4ad708", size = 114353, upload-time = "2026-03-24T12:59:08.246Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "arxiv"
|
||||
version = "3.0.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "feedparser" },
|
||||
{ name = "requests" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/ff/78/1e93a001ed51b5114e1978247078fa3130cbb2794a520603949cbe9a7028/arxiv-3.0.0.tar.gz", hash = "sha256:c8cb0d31208afbc1ceb17bd3f9816c8d4c5ca1e0abf199d211e216715440498d", size = 67344, upload-time = "2026-04-12T22:48:59.623Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/9d/0d/bb2ef604e5548ba73ba6326576908d8285ebf3468b02b86af83381c7c973/arxiv-3.0.0-py3-none-any.whl", hash = "sha256:8b4d4e2e336bfeb71ea653623d7dadb260f682f0475cee2aecad0560a23b34db", size = 11928, upload-time = "2026-04-12T22:48:58.44Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "audioop-lts"
|
||||
version = "0.2.2"
|
||||
@@ -502,6 +515,18 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/31/fb/6d251f3fdfe3346ee60d091f55106513e509659ee005ad39c914182c96f4/fasttext_predict-0.9.2.4-cp313-cp313t-win_amd64.whl", hash = "sha256:be0933fa4af7abae09c703d28f9e17c80e7069eb6f92100b21985b777f4ea275", size = 110325, upload-time = "2024-11-23T17:24:16.984Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "feedparser"
|
||||
version = "6.0.12"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "sgmllib3k" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/dc/79/db7edb5e77d6dfbc54d7d9df72828be4318275b2e580549ff45a962f6461/feedparser-6.0.12.tar.gz", hash = "sha256:64f76ce90ae3e8ef5d1ede0f8d3b50ce26bcce71dd8ae5e82b1cd2d4a5f94228", size = 286579, upload-time = "2025-09-10T13:33:59.486Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/4e/eb/c96d64137e29ae17d83ad2552470bafe3a7a915e85434d9942077d7fd011/feedparser-6.0.12-py3-none-any.whl", hash = "sha256:6bbff10f5a52662c00a2e3f86a38928c37c48f77b3c511aedcd51de933549324", size = 81480, upload-time = "2025-09-10T13:33:58.022Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ffmpy"
|
||||
version = "1.0.0"
|
||||
@@ -1382,6 +1407,7 @@ name = "paperlib"
|
||||
version = "0.1.0"
|
||||
source = { editable = "." }
|
||||
dependencies = [
|
||||
{ name = "arxiv" },
|
||||
{ name = "mineru", extra = ["core"] },
|
||||
{ name = "rich" },
|
||||
{ name = "typer" },
|
||||
@@ -1389,6 +1415,7 @@ dependencies = [
|
||||
|
||||
[package.metadata]
|
||||
requires-dist = [
|
||||
{ name = "arxiv", specifier = ">=2.0.0" },
|
||||
{ name = "mineru", extras = ["core"], specifier = ">=3.0.9" },
|
||||
{ name = "rich", specifier = ">=15.0.0" },
|
||||
{ name = "typer", specifier = ">=0.24.1" },
|
||||
@@ -1947,6 +1974,12 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/e1/e3/c164c88b2e5ce7b24d667b9bd83589cf4f3520d97cad01534cd3c4f55fdb/setuptools-81.0.0-py3-none-any.whl", hash = "sha256:fdd925d5c5d9f62e4b74b30d6dd7828ce236fd6ed998a08d81de62ce5a6310d6", size = 1062021, upload-time = "2026-02-06T21:10:37.175Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sgmllib3k"
|
||||
version = "1.0.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/9e/bd/3704a8c3e0942d711c1299ebf7b9091930adae6675d7c8f476a7ce48653c/sgmllib3k-1.0.0.tar.gz", hash = "sha256:7868fb1c8bfa764c1ac563d3cf369c381d1325d36124933a726f29fcdaa812e9", size = 5750, upload-time = "2010-08-24T14:33:52.445Z" }
|
||||
|
||||
[[package]]
|
||||
name = "shapely"
|
||||
version = "2.1.2"
|
||||
|
||||
Reference in New Issue
Block a user