From 82e4ed6fec25e303a3802867394c01e1695e181c Mon Sep 17 00:00:00 2001 From: Yingjie Wang Date: Fri, 17 Apr 2026 14:40:46 -0400 Subject: [PATCH] update: add core functionality --- pyproject.toml | 1 + src/paperlib/__main__.py | 1 - src/paperlib/cli.py | 422 ++++++++++++++++----- src/paperlib/config.py | 3 +- src/paperlib/converter/__init__.py | 5 + src/paperlib/converter/mineru_converter.py | 134 +++++++ src/paperlib/importer/__init__.py | 6 + src/paperlib/importer/arxiv_importer.py | 112 ++++++ src/paperlib/importer/local_importer.py | 56 +++ src/paperlib/index/__init__.py | 5 + src/paperlib/index/database.py | 321 ++++++++++++++++ src/paperlib/models/__init__.py | 17 + src/paperlib/models/paper.py | 164 ++++++++ src/paperlib/storage/__init__.py | 5 + src/paperlib/storage/manager.py | 183 +++++++++ uv.lock | 33 ++ 16 files changed, 1379 insertions(+), 89 deletions(-) create mode 100644 src/paperlib/converter/__init__.py create mode 100644 src/paperlib/converter/mineru_converter.py create mode 100644 src/paperlib/importer/__init__.py create mode 100644 src/paperlib/importer/arxiv_importer.py create mode 100644 src/paperlib/importer/local_importer.py create mode 100644 src/paperlib/index/__init__.py create mode 100644 src/paperlib/index/database.py create mode 100644 src/paperlib/models/__init__.py create mode 100644 src/paperlib/models/paper.py create mode 100644 src/paperlib/storage/__init__.py create mode 100644 src/paperlib/storage/manager.py diff --git a/pyproject.toml b/pyproject.toml index 0a6ee8f..796047f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,6 +5,7 @@ description = "Local-first CLI toolkit for managing a paper library" readme = "README.md" requires-python = ">=3.13,<3.14" dependencies = [ + "arxiv>=2.0.0", "mineru[core]>=3.0.9", "rich>=15.0.0", "typer>=0.24.1", diff --git a/src/paperlib/__main__.py b/src/paperlib/__main__.py index 997131e..f996b22 100644 --- a/src/paperlib/__main__.py +++ b/src/paperlib/__main__.py @@ -2,6 +2,5 @@ from 
paperlib.cli import main - if __name__ == "__main__": main() diff --git a/src/paperlib/cli.py b/src/paperlib/cli.py index 31244d8..e601738 100644 --- a/src/paperlib/cli.py +++ b/src/paperlib/cli.py @@ -7,120 +7,370 @@ from pathlib import Path from paperlib import __version__ from paperlib.config import LibraryPaths +from paperlib.converter import MinerUConverter +from paperlib.importer import ArxivImporter, LocalImporter +from paperlib.index import DatabaseManager +from paperlib.storage import PaperStorageManager def _resolve_library_root(path: Path | None) -> Path: - """Resolve the target library root, defaulting to the current directory.""" - return (path or Path.cwd()).expanduser() + """Resolve the target library root, defaulting to the current directory.""" + return (path or Path.cwd()).expanduser() def _build_parser() -> argparse.ArgumentParser: - """Create the top-level argument parser.""" - parser = argparse.ArgumentParser( - prog="paperlib", - description="Local-first paper library engine with a CLI.", - ) - parser.add_argument( - "--version", - action="version", - version=f"%(prog)s {__version__}", - ) - subparsers = parser.add_subparsers(dest="command", metavar="COMMAND") + """Create the top-level argument parser.""" + parser = argparse.ArgumentParser( + prog="paperlib", + description="Local-first paper library engine with a CLI.", + ) + parser.add_argument( + "--version", + action="version", + version=f"%(prog)s {__version__}", + ) + subparsers = parser.add_subparsers(dest="command", metavar="COMMAND") - init_parser = subparsers.add_parser( - "init", - help="Initialize a paper library directory.", - ) - init_parser.add_argument( - "path", - nargs="?", - default=".", - help="Directory where the library should be initialized.", - ) - init_parser.set_defaults(handler=_handle_init) + init_parser = subparsers.add_parser( + "init", + help="Initialize a paper library directory.", + ) + init_parser.add_argument( + "path", + nargs="?", + default=".", + 
help="Directory where the library should be initialized.", + ) + init_parser.set_defaults(handler=_handle_init) - status_parser = subparsers.add_parser( - "status", - help="Show the resolved library layout for the selected root.", - ) - status_parser.add_argument( - "--library", - "-L", - default=".", - help="Library root to inspect. Defaults to the current directory.", - ) - status_parser.set_defaults(handler=_handle_status) + status_parser = subparsers.add_parser( + "status", + help="Show the resolved library layout for the selected root.", + ) + status_parser.add_argument( + "--library", + "-L", + default=".", + help="Library root to inspect. Defaults to the current directory.", + ) + status_parser.set_defaults(handler=_handle_status) - list_parser = subparsers.add_parser("list", help="List imported papers.") - list_parser.set_defaults(handler=_handle_list) + list_parser = subparsers.add_parser("list", help="List imported papers.") + list_parser.add_argument("--library", "-L", default=".", help="Library root") + list_parser.set_defaults(handler=_handle_list) - show_parser = subparsers.add_parser( - "show", - help="Show detailed information for a paper.", - ) - show_parser.set_defaults(handler=_handle_show) + show_parser = subparsers.add_parser( + "show", + help="Show detailed information for a paper.", + ) + show_parser.add_argument("paper_id", help="Paper ID to show") + show_parser.add_argument("--library", "-L", default=".", help="Library root") + show_parser.set_defaults(handler=_handle_show) - search_parser = subparsers.add_parser( - "search", - help="Search the paper library.", - ) - search_parser.set_defaults(handler=_handle_search) + search_parser = subparsers.add_parser( + "search", + help="Search the paper library.", + ) + search_parser.set_defaults(handler=_handle_search) - return parser + # Import command + import_parser = subparsers.add_parser( + "import", + help="Import a paper into the library.", + ) + import_group = 
import_parser.add_mutually_exclusive_group(required=True) + import_group.add_argument("--pdf", type=Path, help="Path to a local PDF file") + import_group.add_argument("--arxiv", type=str, help="arXiv ID or URL") + import_parser.add_argument("--title", type=str, help="Title for local PDFs") + import_parser.add_argument("--notes", type=str, default="", help="Notes") + import_parser.add_argument("--tags", nargs="*", default=[], help="Tags") + import_parser.add_argument("--library", "-L", default=".", help="Library root") + import_parser.set_defaults(handler=_handle_import) + + # Convert command + convert_parser = subparsers.add_parser( + "convert", + help="Convert papers to Markdown.", + ) + convert_parser.add_argument("--library", "-L", default=".", help="Library root") + convert_parser.add_argument("--paper-id", help="Convert specific paper by ID") + convert_parser.set_defaults(handler=_handle_convert) + + # Reindex command + reindex_parser = subparsers.add_parser( + "reindex", + help="Rebuild the search index from stored papers.", + ) + reindex_parser.add_argument("--library", "-L", default=".", help="Library root") + reindex_parser.set_defaults(handler=_handle_reindex) + + return parser def _format_paths(paths: LibraryPaths) -> str: - """Render library paths in a simple, grep-friendly format.""" - lines = [ - f"root: {paths.root}", - f"config: {paths.config_path}", - f"database: {paths.db_path}", - f"papers: {paths.papers_dir}", - f"inbox: {paths.inbox_dir}", - f"cache: {paths.cache_dir}", - ] - return "\n".join(lines) + """Render library paths in a simple, grep-friendly format.""" + lines = [ + f"root: {paths.root}", + f"config: {paths.config_path}", + f"database: {paths.db_path}", + f"papers: {paths.papers_dir}", + f"inbox: {paths.inbox_dir}", + f"cache: {paths.cache_dir}", + ] + return "\n".join(lines) def _handle_init(args: argparse.Namespace) -> int: - """Initialize a paper library directory.""" - paths = LibraryPaths.from_root(Path(args.path)) - 
paths.create_directories() - print(f"Initialized paper library at {paths.root}") - print(_format_paths(paths)) - return 0 + """Initialize a paper library directory.""" + paths = LibraryPaths.from_root(Path(args.path)) + paths.create_directories() + print(f"Initialized paper library at {paths.root}") + print(_format_paths(paths)) + return 0 def _handle_status(args: argparse.Namespace) -> int: - """Show the resolved library layout for a selected root.""" - paths = LibraryPaths.from_root(_resolve_library_root(Path(args.library))) - print(_format_paths(paths)) - return 0 + """Show the resolved library layout for a selected root.""" + paths = LibraryPaths.from_root(_resolve_library_root(Path(args.library))) + print(_format_paths(paths)) + return 0 -def _handle_list(_: argparse.Namespace) -> int: - """Placeholder for listing imported papers.""" - print("Listing papers is not implemented yet.") - return 0 +def _handle_list(args: argparse.Namespace) -> int: + """List imported papers.""" + try: + paths = LibraryPaths.from_root( + _resolve_library_root( + Path(args.library if hasattr(args, "library") else ".") + ) + ) + storage_manager = PaperStorageManager(paths) + db_manager = DatabaseManager(paths) + + # Initialize database if it doesn't exist + db_manager.initialize_database() + + # List all papers from storage (more reliable than index) + papers = list(storage_manager.list_all_papers()) + + if not papers: + print("No papers found in library.") + return 0 + + print(f"Found {len(papers)} papers:") + print() + + for metadata in papers: + status_indicators = [] + if metadata.conversion_status.value == "success": + status_indicators.append("📄") # Converted + if metadata.summary_status.value == "success": + status_indicators.append("📝") # Summarized + + status_str = "".join(status_indicators) if status_indicators else "⏳" + + print(f"{status_str} {metadata.paper_id}") + print(f" {metadata.title}") + if metadata.authors: + authors_str = ", ".join(metadata.authors[:3]) + if 
len(metadata.authors) > 3: + authors_str += f" (+{len(metadata.authors) - 3} more)" + print(f" By: {authors_str}") + if metadata.categories: + print(f" Categories: {', '.join(metadata.categories)}") + print() + + return 0 + + except Exception as e: + print(f"Error listing papers: {e}") + return 1 -def _handle_show(_: argparse.Namespace) -> int: - """Placeholder for showing paper details.""" - print("Showing paper details is not implemented yet.") - return 0 +def _handle_show(args: argparse.Namespace) -> int: + """Show detailed information for a paper.""" + if not hasattr(args, "paper_id") or not args.paper_id: + print("Please specify a paper ID with --paper-id") + return 1 + + try: + paths = LibraryPaths.from_root( + _resolve_library_root( + Path(args.library if hasattr(args, "library") else ".") + ) + ) + storage_manager = PaperStorageManager(paths) + + # Find paper by ID + for metadata in storage_manager.list_all_papers(): + if metadata.paper_id == args.paper_id: + print(f"Paper ID: {metadata.paper_id}") + print(f"Source: {metadata.source_type.value}") + if metadata.source_id: + print(f"Source ID: {metadata.source_id}") + print(f"Title: {metadata.title}") + if metadata.authors: + print(f"Authors: {', '.join(metadata.authors)}") + if metadata.published_date: + print(f"Published: {metadata.published_date.strftime('%Y-%m-%d')}") + if metadata.categories: + print(f"Categories: {', '.join(metadata.categories)}") + if metadata.tags: + print(f"Tags: {', '.join(metadata.tags)}") + print(f"Imported: {metadata.imported_at.strftime('%Y-%m-%d %H:%M:%S')}") + print(f"Conversion Status: {metadata.conversion_status.value}") + print(f"Summary Status: {metadata.summary_status.value}") + if metadata.notes: + print(f"Notes: {metadata.notes}") + + # Show file paths + print("\nFiles:") + if metadata.pdf_path: + pdf_path = paths.root / metadata.pdf_path + exists = "✓" if pdf_path.exists() else "✗" + print(f" PDF: {exists} {metadata.pdf_path}") + if metadata.paper_md_path: + md_path = 
paths.root / metadata.paper_md_path + exists = "✓" if md_path.exists() else "✗" + print(f" Markdown: {exists} {metadata.paper_md_path}") + if metadata.summary_json_path: + summary_path = paths.root / metadata.summary_json_path + exists = "✓" if summary_path.exists() else "✗" + print(f" Summary: {exists} {metadata.summary_json_path}") + + return 0 + + print(f"Paper not found: {args.paper_id}") + return 1 + + except Exception as e: + print(f"Error showing paper: {e}") + return 1 def _handle_search(_: argparse.Namespace) -> int: - """Placeholder for searching the paper library.""" - print("Search is not implemented yet.") - return 0 + """Placeholder for searching the paper library.""" + print("Search is not implemented yet.") + return 0 + + +def _handle_import(args: argparse.Namespace) -> int: + """Handle importing a paper into the library.""" + try: + # Set up library paths and managers + paths = LibraryPaths.from_root(_resolve_library_root(Path(args.library))) + storage_manager = PaperStorageManager(paths) + db_manager = DatabaseManager(paths) + + # Initialize database + db_manager.initialize_database() + + if args.pdf: + # Import local PDF + local_importer = LocalImporter(storage_manager) + metadata = local_importer.import_pdf( + pdf_path=args.pdf, + title=args.title or "", + notes=args.notes, + tags=args.tags, + ) + # Index the paper + db_manager.index_paper(metadata) + + print(f"Successfully imported local PDF: {metadata.paper_id}") + print(f"Title: {metadata.title}") + + elif args.arxiv: + # Import from arXiv + arxiv_importer = ArxivImporter(storage_manager) + metadata = arxiv_importer.import_arxiv_paper( + arxiv_input=args.arxiv, + notes=args.notes, + tags=args.tags, + ) + # Index the paper + db_manager.index_paper(metadata) + + print(f"Successfully imported arXiv paper: {metadata.paper_id}") + print(f"Title: {metadata.title}") + print(f"Authors: {', '.join(metadata.authors)}") + + return 0 + + except Exception as e: + print(f"Error importing paper: {e}") + 
return 1 + + +def _handle_convert(args: argparse.Namespace) -> int: + """Handle converting papers to Markdown.""" + try: + # Set up library paths and components + paths = LibraryPaths.from_root(_resolve_library_root(Path(args.library))) + storage_manager = PaperStorageManager(paths) + converter = MinerUConverter(storage_manager) + + if args.paper_id: + # Convert specific paper + for metadata in storage_manager.list_all_papers(): + if metadata.paper_id == args.paper_id: + if converter.convert_paper(metadata): + print(f"Successfully converted paper: {metadata.paper_id}") + else: + print(f"Failed to convert paper: {metadata.paper_id}") + return 0 + print(f"Paper not found: {args.paper_id}") + return 1 + else: + # Convert all pending papers + success_count, failure_count = converter.convert_all_pending() + msg = f"Complete: {success_count} successful, {failure_count} failed" + print(msg) + return 0 if failure_count == 0 else 1 + + except Exception as e: + print(f"Error during conversion: {e}") + return 1 + + +def _handle_reindex(args: argparse.Namespace) -> int: + """Rebuild the search index from stored papers.""" + try: + paths = LibraryPaths.from_root(_resolve_library_root(Path(args.library))) + storage_manager = PaperStorageManager(paths) + db_manager = DatabaseManager(paths) + + print("Rebuilding search index...") + + # Initialize database schema + db_manager.initialize_database() + + # Rebuild index from storage + success_count, error_count = db_manager.reindex_from_storage(storage_manager) + + print(f"Reindex complete: {success_count} papers indexed, {error_count} errors") + + # Show statistics + stats = db_manager.get_statistics() + print(f"Total papers: {stats['total_papers']}") + if stats.get("by_source_type"): + by_source = ", ".join( + f"{k}: {v}" for k, v in stats["by_source_type"].items() + ) + print(f"By source: {by_source}") + + return 0 if error_count == 0 else 1 + + except Exception as e: + print(f"Error during reindex: {e}") + return 1 def main() -> 
None: - """Console script entrypoint.""" - parser = _build_parser() - args = parser.parse_args() - if not hasattr(args, "handler"): - parser.print_help() - raise SystemExit(0) - raise SystemExit(args.handler(args)) + """Console script entrypoint.""" + parser = _build_parser() + args = parser.parse_args() + if not hasattr(args, "handler"): + parser.print_help() + raise SystemExit(0) + raise SystemExit(args.handler(args)) diff --git a/src/paperlib/config.py b/src/paperlib/config.py index 2db4558..0b0be6e 100644 --- a/src/paperlib/config.py +++ b/src/paperlib/config.py @@ -5,7 +5,6 @@ from __future__ import annotations from dataclasses import dataclass from pathlib import Path - DEFAULT_CONFIG_DIRNAME = "config" DEFAULT_DB_DIRNAME = "db" DEFAULT_CACHE_DIRNAME = "cache" @@ -29,7 +28,7 @@ class LibraryPaths: config_path: Path @classmethod - def from_root(cls, root: Path) -> "LibraryPaths": + def from_root(cls, root: Path) -> LibraryPaths: """Build a standard library layout from a root directory.""" resolved_root = root.expanduser().resolve() config_dir = resolved_root / DEFAULT_CONFIG_DIRNAME diff --git a/src/paperlib/converter/__init__.py b/src/paperlib/converter/__init__.py new file mode 100644 index 0000000..6b4bded --- /dev/null +++ b/src/paperlib/converter/__init__.py @@ -0,0 +1,5 @@ +"""PDF conversion functionality for paperlib.""" + +from .mineru_converter import MinerUConverter + +__all__ = ["MinerUConverter"] diff --git a/src/paperlib/converter/mineru_converter.py b/src/paperlib/converter/mineru_converter.py new file mode 100644 index 0000000..9836e81 --- /dev/null +++ b/src/paperlib/converter/mineru_converter.py @@ -0,0 +1,134 @@ +"""PDF to Markdown conversion using MinerU.""" + +from __future__ import annotations + +import logging +import subprocess +import sys + +from paperlib.models import ConversionStatus, PaperMetadata +from paperlib.storage import PaperStorageManager + + +class MinerUConverter: + """Handles PDF to Markdown conversion using MinerU.""" + + 
def __init__(self, storage_manager: PaperStorageManager) -> None: + self.storage_manager = storage_manager + self.logger = logging.getLogger(__name__) + + def is_mineru_available(self) -> bool: + """Check if MinerU is available in the environment.""" + try: + result = subprocess.run( + [sys.executable, "-c", "import mineru"], + capture_output=True, + check=False, + ) + return result.returncode == 0 + except (subprocess.SubprocessError, FileNotFoundError): + return False + + def convert_paper(self, metadata: PaperMetadata) -> bool: + """Convert a paper's PDF to Markdown using MinerU.""" + if not self.is_mineru_available(): + self.logger.error("MinerU is not available") + return False + + # Get paper paths + paths = self.storage_manager.get_paper_paths( + metadata.paper_id, metadata.source_type + ) + pdf_path = self.storage_manager.library_paths.root / metadata.pdf_path + markdown_path = paths["markdown"] + logs_dir = paths["logs"] + + if not pdf_path.exists(): + self.logger.error(f"PDF file not found: {pdf_path}") + return False + + # Update status to processing + metadata.conversion_status = ConversionStatus.PROCESSING + self.storage_manager.update_paper_metadata(metadata) + + try: + # Run MinerU conversion + log_file = logs_dir / "mineru.log" + + # MinerU command + cmd = [ + sys.executable, + "-m", + "magic_pdf.pipe.UNIPipe", + "--pdf", + str(pdf_path), + "--output-dir", + str(paths["directory"]), + ] + + self.logger.info(f"Running MinerU conversion: {' '.join(cmd)}") + + with log_file.open("w") as log: + result = subprocess.run( + cmd, + stdout=log, + stderr=subprocess.STDOUT, + cwd=paths["directory"], + check=False, + ) + + # Check if conversion was successful + if result.returncode == 0: + # MinerU typically outputs markdown files, try to find the main one + # Look for common output patterns + markdown_candidates = list(paths["directory"].glob("*.md")) + if not markdown_candidates: + # Try subdirectories + markdown_candidates = 
list(paths["directory"].rglob("*.md")) + + if markdown_candidates: + # Use the first markdown file found, or rename if needed + main_md = markdown_candidates[0] + if main_md != markdown_path: + main_md.rename(markdown_path) + + # Update metadata + metadata.conversion_status = ConversionStatus.SUCCESS + self.storage_manager.update_paper_metadata(metadata) + + self.logger.info( + f"Successfully converted {pdf_path} to {markdown_path}" + ) + return True + else: + self.logger.error("No markdown output found after conversion") + metadata.conversion_status = ConversionStatus.FAILED + self.storage_manager.update_paper_metadata(metadata) + return False + else: + self.logger.error( + f"MinerU conversion failed with return code {result.returncode}" + ) + metadata.conversion_status = ConversionStatus.FAILED + self.storage_manager.update_paper_metadata(metadata) + return False + + except Exception as e: + self.logger.error(f"Exception during conversion: {e}") + metadata.conversion_status = ConversionStatus.FAILED + self.storage_manager.update_paper_metadata(metadata) + return False + + def convert_all_pending(self) -> tuple[int, int]: + """Convert all papers with pending conversion status.""" + success_count = 0 + failure_count = 0 + + for metadata in self.storage_manager.list_all_papers(): + if metadata.conversion_status == ConversionStatus.PENDING: + if self.convert_paper(metadata): + success_count += 1 + else: + failure_count += 1 + + return success_count, failure_count diff --git a/src/paperlib/importer/__init__.py b/src/paperlib/importer/__init__.py new file mode 100644 index 0000000..a98a1d6 --- /dev/null +++ b/src/paperlib/importer/__init__.py @@ -0,0 +1,6 @@ +"""Import functionality for paperlib.""" + +from .arxiv_importer import ArxivImporter +from .local_importer import LocalImporter + +__all__ = ["ArxivImporter", "LocalImporter"] diff --git a/src/paperlib/importer/arxiv_importer.py b/src/paperlib/importer/arxiv_importer.py new file mode 100644 index 0000000..251c31a 
--- /dev/null +++ b/src/paperlib/importer/arxiv_importer.py @@ -0,0 +1,112 @@ +"""arXiv import functionality.""" + +from __future__ import annotations + +import re +import tempfile +from pathlib import Path + +import arxiv + +from paperlib.models import PaperMetadata, SourceType +from paperlib.storage import PaperStorageManager + + +class ArxivImporter: + """Handles importing papers from arXiv.""" + + def __init__(self, storage_manager: PaperStorageManager) -> None: + self.storage_manager = storage_manager + # Create arXiv client with reasonable defaults + self.client = arxiv.Client(page_size=10, delay_seconds=3.0, num_retries=3) + + def extract_arxiv_id(self, input_string: str) -> str: + """Extract arXiv ID from various input formats.""" + # Clean input + input_string = input_string.strip() + + # Pattern for arXiv ID (both old and new formats) + # New format: YYMM.NNNNN[vN] + # Old format: subject-class/YYMMnnn + patterns = [ + r"(?:arxiv:)?(\d{4}\.\d{4,5}(?:v\d+)?)", # New format + r"(?:arxiv:)?([a-z-]+/\d{7})", # Old format + ] + + for pattern in patterns: + match = re.search(pattern, input_string, re.IGNORECASE) + if match: + return match.group(1) + + # If no pattern matches, assume it's already a clean arXiv ID + return input_string + + def fetch_paper_metadata(self, arxiv_id: str) -> arxiv.Result: + """Fetch paper metadata from arXiv API.""" + search = arxiv.Search(id_list=[arxiv_id]) + + results = list(self.client.results(search)) + if not results: + msg = f"Paper not found on arXiv: {arxiv_id}" + raise ValueError(msg) + + return results[0] + + def download_pdf(self, result: arxiv.Result) -> Path: + """Download PDF from arXiv to a temporary location.""" + with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file: + tmp_path = Path(tmp_file.name) + + # Download PDF + result.download_pdf(filename=str(tmp_path)) + + return tmp_path + + def import_arxiv_paper( + self, arxiv_input: str, notes: str = "", tags: list[str] | None = None + ) -> 
PaperMetadata: + """Import a paper from arXiv.""" + # Extract clean arXiv ID + arxiv_id = self.extract_arxiv_id(arxiv_input) + + # Check if already imported + paper_id = self.storage_manager.generate_paper_id(SourceType.ARXIV, arxiv_id) + if self.storage_manager.paper_exists(paper_id, SourceType.ARXIV): + msg = f"Paper already imported: {arxiv_id}" + raise ValueError(msg) + + # Fetch metadata from arXiv + result = self.fetch_paper_metadata(arxiv_id) + + # Download PDF + pdf_path = self.download_pdf(result) + + try: + # Convert arXiv result to our metadata format + published_date = ( + result.published.replace(tzinfo=None) if result.published else None + ) + updated_date = ( + result.updated.replace(tzinfo=None) if result.updated else None + ) + + # Store the paper + metadata = self.storage_manager.store_paper( + pdf_path=pdf_path, + source_type=SourceType.ARXIV, + source_id=arxiv_id, + title=result.title, + authors=[author.name for author in result.authors], + published_date=published_date, + updated_date=updated_date, + categories=[cat for cat in result.categories], + notes=notes, + tags=tags or [], + ) + + return metadata + + finally: + # Clean up temporary PDF file + if pdf_path.exists(): + pdf_path.unlink() diff --git a/src/paperlib/importer/local_importer.py b/src/paperlib/importer/local_importer.py new file mode 100644 index 0000000..d19d023 --- /dev/null +++ b/src/paperlib/importer/local_importer.py @@ -0,0 +1,56 @@ +"""Local PDF import functionality.""" + +from __future__ import annotations + +from pathlib import Path + +from paperlib.models import PaperMetadata, SourceType +from paperlib.storage import PaperStorageManager + + +class LocalImporter: + """Handles importing local PDF files.""" + + def __init__(self, storage_manager: PaperStorageManager) -> None: + self.storage_manager = storage_manager + + def import_pdf( + self, + pdf_path: Path, + title: str = "", + notes: str = "", + tags: list[str] | None = None, + ) -> PaperMetadata: + """Import a local 
PDF file.""" + if not pdf_path.exists(): + msg = f"PDF file not found: {pdf_path}" + raise FileNotFoundError(msg) + + if not pdf_path.suffix.lower() == ".pdf": + msg = f"File is not a PDF: {pdf_path}" + raise ValueError(msg) + + # Generate paper ID and check for duplicates + paper_id = self.storage_manager.generate_paper_id( + SourceType.LOCAL, pdf_path=pdf_path + ) + + if self.storage_manager.paper_exists(paper_id, SourceType.LOCAL): + msg = f"Paper already imported: {paper_id}" + raise ValueError(msg) + + # Extract title from filename if not provided + if not title: + title = pdf_path.stem.replace("_", " ").replace("-", " ").title() + + # Store the paper + metadata = self.storage_manager.store_paper( + pdf_path=pdf_path, + source_type=SourceType.LOCAL, + source_id=None, + title=title, + notes=notes, + tags=tags or [], + ) + + return metadata diff --git a/src/paperlib/index/__init__.py b/src/paperlib/index/__init__.py new file mode 100644 index 0000000..9a4f334 --- /dev/null +++ b/src/paperlib/index/__init__.py @@ -0,0 +1,5 @@ +"""SQLite index layer for paperlib.""" + +from .database import DatabaseManager + +__all__ = ["DatabaseManager"] diff --git a/src/paperlib/index/database.py b/src/paperlib/index/database.py new file mode 100644 index 0000000..ab3cb30 --- /dev/null +++ b/src/paperlib/index/database.py @@ -0,0 +1,321 @@ +"""SQLite database manager for indexing papers.""" + +from __future__ import annotations + +import sqlite3 +from collections.abc import Iterator + +from paperlib.config import LibraryPaths +from paperlib.models import ConversionStatus, PaperMetadata, SourceType, SummaryStatus + + +class DatabaseManager: + """Manages SQLite database for indexing papers.""" + + def __init__(self, library_paths: LibraryPaths) -> None: + self.library_paths = library_paths + self.db_path = library_paths.db_path + + def _get_connection(self) -> sqlite3.Connection: + """Get a database connection with proper settings.""" + # Ensure database directory exists + 
self.db_path.parent.mkdir(parents=True, exist_ok=True) + + conn = sqlite3.connect(self.db_path) + conn.row_factory = sqlite3.Row # Enable dict-like access to rows + conn.execute("PRAGMA foreign_keys = ON") # Enable foreign keys + return conn + + def initialize_database(self) -> None: + """Initialize the database schema.""" + with self._get_connection() as conn: + # Main papers table + conn.execute(""" + CREATE TABLE IF NOT EXISTS papers ( + paper_id TEXT PRIMARY KEY, + source_type TEXT NOT NULL, + source_id TEXT, + title TEXT NOT NULL, + authors_json TEXT NOT NULL, -- JSON array of authors + published_date TEXT, -- ISO format + updated_date TEXT, -- ISO format + categories_json TEXT NOT NULL, -- JSON array of categories + pdf_path TEXT, + paper_md_path TEXT, + summary_json_path TEXT, + summary_md_path TEXT, + imported_at TEXT NOT NULL, -- ISO format + conversion_status TEXT NOT NULL, + summary_status TEXT NOT NULL, + tags_json TEXT NOT NULL, -- JSON array of tags + notes TEXT NOT NULL, + + -- Computed fields for search + search_text TEXT, -- Full-text search content + author_list TEXT, -- Space-separated authors for search + category_list TEXT -- Space-separated categories for search + ) + """) + + # Create indexes for common queries + conn.execute( + "CREATE INDEX IF NOT EXISTS idx_papers_source_type ON papers(source_type)" + ) + conn.execute( + "CREATE INDEX IF NOT EXISTS idx_papers_source_id ON papers(source_id)" + ) + conn.execute( + "CREATE INDEX IF NOT EXISTS idx_papers_conversion_status " + "ON papers(conversion_status)" + ) + conn.execute( + "CREATE INDEX IF NOT EXISTS idx_papers_summary_status " + "ON papers(summary_status)" + ) + conn.execute( + "CREATE INDEX IF NOT EXISTS idx_papers_imported_at ON papers(imported_at)" + ) + + # Full-text search virtual table + conn.execute(""" + CREATE VIRTUAL TABLE IF NOT EXISTS papers_fts USING fts5( + paper_id UNINDEXED, + title, + authors, + search_text, + categories, + tags, + notes, + content='papers', + 
# Index and query methods of the paper database manager. The enclosing class
# header and ``_get_connection`` lie outside this excerpt.
def index_paper(self, metadata: PaperMetadata) -> None:
    """Insert or refresh one paper in the ``papers`` and ``papers_fts`` tables.

    Indexing a ``paper_id`` that is already present replaces its previous
    entry in both tables instead of adding a second search row.

    Args:
        metadata: Paper record to index. List-valued fields are stored as
            JSON in ``papers`` and as space-joined text in the search columns.
    """
    import json  # function-scope import, as in the original method

    paper_id = metadata.paper_id
    authors_text = " ".join(metadata.authors)
    categories_text = " ".join(metadata.categories)
    tags_text = " ".join(metadata.tags)
    # One denormalised blob so a single column covers every searchable field.
    search_text = " ".join(
        [metadata.title, authors_text, categories_text, tags_text, metadata.notes]
    )
    published = (
        metadata.published_date.isoformat() if metadata.published_date else None
    )
    updated = metadata.updated_date.isoformat() if metadata.updated_date else None

    papers_row = (
        paper_id,
        metadata.source_type.value,
        metadata.source_id,
        metadata.title,
        json.dumps(metadata.authors),
        published,
        updated,
        json.dumps(metadata.categories),
        metadata.pdf_path,
        metadata.paper_md_path,
        metadata.summary_json_path,
        metadata.summary_md_path,
        metadata.imported_at.isoformat(),
        metadata.conversion_status.value,
        metadata.summary_status.value,
        json.dumps(metadata.tags),
        metadata.notes,
        search_text,
        authors_text,
        categories_text,
    )
    fts_row = (
        paper_id,
        metadata.title,
        authors_text,
        search_text,
        categories_text,
        tags_text,
        metadata.notes,
    )

    with self._get_connection() as conn:
        # papers_fts is queried with MATCH, i.e. it is a full-text table; such
        # tables presumably carry no uniqueness constraint on paper_id, so
        # INSERT OR REPLACE alone would append a duplicate row on every
        # re-index. Drop the stale entry explicitly, before the main row is
        # rewritten.
        conn.execute("DELETE FROM papers_fts WHERE paper_id = ?", (paper_id,))

        conn.execute(
            """
            INSERT OR REPLACE INTO papers (
                paper_id, source_type, source_id, title, authors_json,
                published_date, updated_date, categories_json, pdf_path,
                paper_md_path, summary_json_path, summary_md_path,
                imported_at, conversion_status, summary_status,
                tags_json, notes, search_text, author_list, category_list
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            papers_row,
        )

        conn.execute(
            """
            INSERT OR REPLACE INTO papers_fts (
                paper_id, title, authors, search_text, categories, tags, notes
            ) VALUES (?, ?, ?, ?, ?, ?, ?)
            """,
            fts_row,
        )


def remove_paper(self, paper_id: str) -> bool:
    """Delete a paper from both index tables.

    Returns:
        True when a row was removed from ``papers``, False when the ID was
        not indexed.
    """
    with self._get_connection() as conn:
        deleted = conn.execute("DELETE FROM papers WHERE paper_id = ?", (paper_id,))
        conn.execute("DELETE FROM papers_fts WHERE paper_id = ?", (paper_id,))
        return deleted.rowcount > 0


def get_paper(self, paper_id: str) -> dict | None:
    """Return the ``papers`` row for *paper_id* as a dict, or None if absent."""
    with self._get_connection() as conn:
        row = conn.execute(
            "SELECT * FROM papers WHERE paper_id = ?", (paper_id,)
        ).fetchone()
        return dict(row) if row else None


def list_papers(
    self,
    source_type: SourceType | None = None,
    conversion_status: ConversionStatus | None = None,
    summary_status: SummaryStatus | None = None,
    limit: int | None = None,
    offset: int = 0,
) -> Iterator[dict]:
    """Yield indexed papers, newest import first, with optional filtering.

    Args:
        source_type: Keep only papers from this source.
        conversion_status: Keep only papers in this conversion state.
        summary_status: Keep only papers in this summary state.
        limit: Maximum number of rows; a falsy value (None or 0) means
            "no limit", as before.
        offset: Number of leading rows to skip. It is honoured whether or
            not a limit is given.

    Yields:
        One dict per matching ``papers`` row.
    """
    filters = (
        ("source_type", source_type),
        ("conversion_status", conversion_status),
        ("summary_status", summary_status),
    )
    conditions = [f"{column} = ?" for column, wanted in filters if wanted]
    params: list[object] = [wanted.value for _, wanted in filters if wanted]

    where_clause = f"WHERE {' AND '.join(conditions)}" if conditions else ""
    query = f"SELECT * FROM papers {where_clause} ORDER BY imported_at DESC"

    if limit:
        query += " LIMIT ? OFFSET ?"
        params.extend([limit, offset])
    elif offset:
        # SQLite only accepts OFFSET together with LIMIT; a negative LIMIT
        # means "no upper bound", so the offset is no longer dropped.
        query += " LIMIT -1 OFFSET ?"
        params.append(offset)

    with self._get_connection() as conn:
        for row in conn.execute(query, params):
            yield dict(row)


def search_papers(self, query: str, limit: int = 50) -> Iterator[dict]:
    """Yield papers matching a full-text query, best match first.

    Args:
        query: Search expression. It is handed to ``MATCH`` unchanged, so it
            is interpreted as full-text query syntax rather than as a
            literal phrase.
        limit: Maximum number of rows to yield.
    """
    with self._get_connection() as conn:
        matches = conn.execute(
            """
            SELECT papers.* FROM papers_fts
            JOIN papers ON papers.paper_id = papers_fts.paper_id
            WHERE papers_fts MATCH ?
            ORDER BY rank
            LIMIT ?
            """,
            (query, limit),
        )
        for row in matches:
            yield dict(row)


def search_by_field(
    self,
    field: str,
    value: str,
    exact_match: bool = False,
    limit: int = 50,
) -> Iterator[dict]:
    """Yield papers whose *field* equals or contains *value*.

    Args:
        field: One of ``title``, ``author_list``, ``category_list``, ``notes``.
        value: Text to look for.
        exact_match: Compare with ``=`` when True, otherwise with a
            ``LIKE '%value%'`` substring pattern.
        limit: Maximum number of rows to yield.

    Raises:
        ValueError: If *field* is not one of the searchable columns. As this
            is a generator, the error surfaces on first iteration.
    """
    # The column name is interpolated into the SQL text, so it must come from
    # this fixed whitelist; the value itself is always bound as a parameter.
    searchable = ("title", "author_list", "category_list", "notes")
    if field not in searchable:
        msg = f"Invalid field for search: {field}"
        raise ValueError(msg)

    if exact_match:
        where_clause = f"{field} = ?"
        params: list[object] = [value]
    else:
        where_clause = f"{field} LIKE ?"
        params = [f"%{value}%"]

    query = (
        f"SELECT * FROM papers WHERE {where_clause} "
        "ORDER BY imported_at DESC LIMIT ?"
    )
    params.append(limit)

    with self._get_connection() as conn:
        for row in conn.execute(query, params):
            yield dict(row)


def get_statistics(self) -> dict:
    """Return paper counts: overall and grouped by source and status columns.

    Returns:
        A dict with ``total_papers`` (int) and the mappings
        ``by_source_type``, ``by_conversion_status`` and
        ``by_summary_status`` (column value -> row count).
    """
    grouped_columns = ("source_type", "conversion_status", "summary_status")

    with self._get_connection() as conn:
        total = conn.execute("SELECT COUNT(*) as count FROM papers").fetchone()
        stats: dict = {"total_papers": total["count"]}

        # Column names come from the fixed tuple above, never from callers.
        for column in grouped_columns:
            rows = conn.execute(
                f"SELECT {column}, COUNT(*) as count FROM papers GROUP BY {column}"
            )
            stats[f"by_{column}"] = {row[column]: row["count"] for row in rows}

        return stats


def reindex_from_storage(self, storage_manager) -> tuple[int, int]:
    """Rebuild the index from the metadata files held by *storage_manager*.

    Both index tables are emptied first; every paper yielded by
    ``storage_manager.list_all_papers()`` is then indexed again.

    Returns:
        ``(success_count, error_count)``. A paper that fails to index is
        counted and skipped so that one bad record does not abort the rebuild.
    """
    success_count = 0
    error_count = 0

    with self._get_connection() as conn:
        conn.execute("DELETE FROM papers")
        conn.execute("DELETE FROM papers_fts")

    for metadata in storage_manager.list_all_papers():
        try:
            self.index_paper(metadata)
        except Exception:
            # Deliberate best effort: the failure is reported via the count.
            error_count += 1
        else:
            success_count += 1

    return success_count, error_count
"PaperMetadata", + "PaperSummary", + "SourceType", + "SummaryStatus", +] diff --git a/src/paperlib/models/paper.py b/src/paperlib/models/paper.py new file mode 100644 index 0000000..344ffec --- /dev/null +++ b/src/paperlib/models/paper.py @@ -0,0 +1,164 @@ +"""Data models for paper metadata and summaries.""" + +from __future__ import annotations + +import json +from dataclasses import asdict, dataclass, field +from datetime import datetime +from enum import StrEnum +from pathlib import Path +from typing import Any + + +class ConversionStatus(StrEnum): + """Status of PDF to Markdown conversion.""" + + PENDING = "pending" + PROCESSING = "processing" + SUCCESS = "success" + FAILED = "failed" + + +class SummaryStatus(StrEnum): + """Status of AI summarization.""" + + PENDING = "pending" + PROCESSING = "processing" + SUCCESS = "success" + FAILED = "failed" + NOT_REQUESTED = "not_requested" + + +class SourceType(StrEnum): + """Type of paper source.""" + + LOCAL = "local" + ARXIV = "arxiv" + + +@dataclass +class PaperMetadata: + """Metadata for a paper (stored in meta.json).""" + + # Core identifiers + paper_id: str + source_type: SourceType + source_id: str | None = None # arXiv ID or local file hash + + # Bibliographic information + title: str = "" + authors: list[str] = field(default_factory=list) + published_date: datetime | None = None + updated_date: datetime | None = None + categories: list[str] = field(default_factory=list) + + # File paths (relative to library root) + pdf_path: str | None = None + paper_md_path: str | None = None + summary_json_path: str | None = None + summary_md_path: str | None = None + + # Processing status + imported_at: datetime = field(default_factory=datetime.now) + conversion_status: ConversionStatus = ConversionStatus.PENDING + summary_status: SummaryStatus = SummaryStatus.NOT_REQUESTED + + # Additional metadata + tags: list[str] = field(default_factory=list) + notes: str = "" + + def to_dict(self) -> dict[str, Any]: + """Convert to 
dictionary for JSON serialization.""" + data = asdict(self) + # Convert datetime objects to ISO format strings + for field_name in ["published_date", "updated_date", "imported_at"]: + if data[field_name] is not None: + data[field_name] = data[field_name].isoformat() + # Convert enums to strings + data["source_type"] = self.source_type.value + data["conversion_status"] = self.conversion_status.value + data["summary_status"] = self.summary_status.value + return data + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> PaperMetadata: + """Create from dictionary (JSON deserialization).""" + # Convert ISO format strings back to datetime objects + for field_name in ["published_date", "updated_date", "imported_at"]: + if data.get(field_name): + data[field_name] = datetime.fromisoformat(data[field_name]) + # Convert strings back to enums + if "source_type" in data: + data["source_type"] = SourceType(data["source_type"]) + if "conversion_status" in data: + data["conversion_status"] = ConversionStatus(data["conversion_status"]) + if "summary_status" in data: + data["summary_status"] = SummaryStatus(data["summary_status"]) + return cls(**data) + + def save_to_file(self, file_path: Path) -> None: + """Save metadata to a JSON file atomically.""" + # Write to temporary file first, then move (atomic operation) + temp_path = file_path.with_suffix(".tmp") + with temp_path.open("w") as f: + json.dump(self.to_dict(), f, indent=2) + temp_path.rename(file_path) + + @classmethod + def load_from_file(cls, file_path: Path) -> PaperMetadata: + """Load metadata from a JSON file.""" + with file_path.open() as f: + data = json.load(f) + return cls.from_dict(data) + + +@dataclass +class PaperSummary: + """Structured summary for a paper (stored in summary.json).""" + + # Schema version for migration + schema_version: str = "1.0" + + # Core summary fields + one_sentence_summary: str = "" + problem_statement: str = "" + method_overview: str = "" + main_results: str = "" + 
claimed_contributions: list[str] = field(default_factory=list) + assumptions: list[str] = field(default_factory=list) + limitations: list[str] = field(default_factory=list) + + # Categorization + problem_tags: list[str] = field(default_factory=list) + technique_tags: list[str] = field(default_factory=list) + + # Entities mentioned + entities: list[str] = field(default_factory=list) + + # Relevance scoring (optional) + relevance_to_user: float | None = None + recommended_sections: list[str] = field(default_factory=list) + + def to_dict(self) -> dict[str, Any]: + """Convert to dictionary for JSON serialization.""" + return asdict(self) + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> PaperSummary: + """Create from dictionary (JSON deserialization).""" + return cls(**data) + + def save_to_file(self, file_path: Path) -> None: + """Save summary to a JSON file atomically.""" + # Write to temporary file first, then move (atomic operation) + temp_path = file_path.with_suffix(".tmp") + with temp_path.open("w") as f: + json.dump(self.to_dict(), f, indent=2) + temp_path.rename(file_path) + + @classmethod + def load_from_file(cls, file_path: Path) -> PaperSummary: + """Load summary from a JSON file.""" + with file_path.open() as f: + data = json.load(f) + return cls.from_dict(data) diff --git a/src/paperlib/storage/__init__.py b/src/paperlib/storage/__init__.py new file mode 100644 index 0000000..5d86f87 --- /dev/null +++ b/src/paperlib/storage/__init__.py @@ -0,0 +1,5 @@ +"""Storage layer for paperlib.""" + +from .manager import PaperStorageManager + +__all__ = ["PaperStorageManager"] diff --git a/src/paperlib/storage/manager.py b/src/paperlib/storage/manager.py new file mode 100644 index 0000000..da62027 --- /dev/null +++ b/src/paperlib/storage/manager.py @@ -0,0 +1,183 @@ +"""Paper storage manager for CRUD operations on metadata files.""" + +from __future__ import annotations + +import hashlib +import shutil +from collections.abc import Iterator +from datetime 
class PaperStorageManager:
    """Manages storage and retrieval of papers and their metadata.

    Every paper lives in its own directory below ``library_paths.papers_dir``;
    that directory is derived purely from the paper ID and source type, so the
    same inputs always resolve to the same location.
    """

    def __init__(self, library_paths: LibraryPaths) -> None:
        self.library_paths = library_paths

    def generate_paper_id(
        self,
        source_type: SourceType,
        source_id: str | None = None,
        pdf_path: Path | None = None,
    ) -> str:
        """Generate a stable paper ID based on source type and content.

        Args:
            source_type: Where the paper comes from.
            source_id: arXiv identifier; required for arXiv papers.
            pdf_path: PDF file to fingerprint; required for local papers.

        Returns:
            ``arxiv-<normalised id>`` or ``local-<first 16 hex chars of SHA-256>``.

        Raises:
            ValueError: If the information required for *source_type* is missing.
        """
        if source_type == SourceType.ARXIV and source_id:
            normalized = source_id.replace(".", "_").replace("v", "_v")
            return f"arxiv-{normalized}"

        if source_type == SourceType.LOCAL and pdf_path:
            # Hash in chunks so a large PDF is never held in memory at once.
            digest = hashlib.sha256()
            with pdf_path.open("rb") as pdf_file:
                for chunk in iter(lambda: pdf_file.read(1024 * 1024), b""):
                    digest.update(chunk)
            return f"local-{digest.hexdigest()[:16]}"

        msg = "Cannot generate paper ID without proper source information"
        raise ValueError(msg)

    @staticmethod
    def _arxiv_year(paper_id: str) -> str | None:
        """Return the four-digit year encoded in an arXiv paper ID, if any.

        Both arXiv identifier schemes begin their numeric part with ``YYMM``:
        ``2212.06340`` (current scheme) and ``hep-th/9901001`` (older scheme,
        number after the slash). None is returned when no such prefix exists.
        """
        identifier = paper_id.removeprefix("arxiv-")
        yymm = identifier.rpartition("/")[2][:4]
        if len(yymm) != 4 or not (yymm.isascii() and yymm.isdigit()):
            return None
        two_digit_year = int(yymm[:2])
        # arXiv started in 1991, so 91-99 can only refer to the 1990s.
        century = 1900 if two_digit_year >= 91 else 2000
        return str(century + two_digit_year)

    def get_paper_directory(self, paper_id: str, source_type: SourceType) -> Path:
        """Get the directory path for storing a paper's files.

        arXiv papers are grouped by submission year
        (``papers/arxiv/2022/arxiv-2212_06340``); local papers are stored
        under ``papers/local/<hash>``.
        """
        if source_type == SourceType.ARXIV:
            # The YYMM prefix is expanded to a real year ("2212" -> "2022").
            # IDs without one share a fixed bucket instead of the current
            # calendar year, so the location cannot drift over time.
            year = self._arxiv_year(paper_id) or "unknown"
            return self.library_paths.papers_dir / "arxiv" / year / paper_id

        hash_part = paper_id.removeprefix("local-")
        return self.library_paths.papers_dir / "local" / hash_part

    def get_paper_paths(
        self, paper_id: str, source_type: SourceType
    ) -> dict[str, Path]:
        """Get all expected file paths for a paper.

        Returns:
            A mapping with the keys ``directory``, ``meta``, ``pdf``,
            ``markdown``, ``summary_json``, ``summary_md``, ``assets`` and
            ``logs``. The paths are computed only; nothing is created.
        """
        paper_dir = self.get_paper_directory(paper_id, source_type)
        return {
            "directory": paper_dir,
            "meta": paper_dir / "meta.json",
            "pdf": paper_dir / "source.pdf",
            "markdown": paper_dir / "paper.md",
            "summary_json": paper_dir / "summary.json",
            "summary_md": paper_dir / "summary.md",
            "assets": paper_dir / "assets",
            "logs": paper_dir / "logs",
        }

    def store_paper(
        self,
        pdf_path: Path,
        source_type: SourceType,
        source_id: str | None = None,
        **metadata_kwargs,
    ) -> PaperMetadata:
        """Store a paper and create its metadata.

        Creates the paper directory (with ``assets`` and ``logs``), copies the
        PDF to ``source.pdf`` and writes ``meta.json``.

        Args:
            pdf_path: PDF file to copy into the library.
            source_type: Where the paper comes from.
            source_id: arXiv identifier, if any.
            **metadata_kwargs: Extra ``PaperMetadata`` fields (title, authors, ...).

        Returns:
            The metadata record that was written; its paths are relative to
            the library root.
        """
        paper_id = self.generate_paper_id(source_type, source_id, pdf_path)
        paths = self.get_paper_paths(paper_id, source_type)

        paths["directory"].mkdir(parents=True, exist_ok=True)
        paths["assets"].mkdir(exist_ok=True)
        paths["logs"].mkdir(exist_ok=True)

        shutil.copy2(pdf_path, paths["pdf"])

        root = self.library_paths.root
        metadata = PaperMetadata(
            paper_id=paper_id,
            source_type=source_type,
            source_id=source_id,
            pdf_path=str(paths["pdf"].relative_to(root)),
            paper_md_path=str(paths["markdown"].relative_to(root)),
            summary_json_path=str(paths["summary_json"].relative_to(root)),
            summary_md_path=str(paths["summary_md"].relative_to(root)),
            **metadata_kwargs,
        )
        metadata.save_to_file(paths["meta"])
        return metadata

    def load_paper_metadata(
        self, paper_id: str, source_type: SourceType
    ) -> PaperMetadata | None:
        """Load paper metadata from storage.

        Returns:
            The metadata, or None when ``meta.json`` is missing or cannot be
            turned into a ``PaperMetadata`` (invalid JSON, unknown enum value,
            unexpected or missing fields).
        """
        meta_path = self.get_paper_paths(paper_id, source_type)["meta"]
        try:
            return PaperMetadata.load_from_file(meta_path)
        except (FileNotFoundError, ValueError, TypeError):
            # TypeError: the JSON parsed but its keys do not match the model.
            return None

    def update_paper_metadata(self, metadata: PaperMetadata) -> None:
        """Update paper metadata in storage by rewriting its ``meta.json``."""
        paths = self.get_paper_paths(metadata.paper_id, metadata.source_type)
        metadata.save_to_file(paths["meta"])

    def load_paper_summary(
        self, paper_id: str, source_type: SourceType
    ) -> PaperSummary | None:
        """Load paper summary from storage.

        Returns:
            The summary, or None when ``summary.json`` is missing or does not
            match the ``PaperSummary`` fields.
        """
        summary_path = self.get_paper_paths(paper_id, source_type)["summary_json"]
        try:
            return PaperSummary.load_from_file(summary_path)
        except (FileNotFoundError, ValueError, TypeError):
            return None

    def save_paper_summary(
        self, paper_id: str, source_type: SourceType, summary: PaperSummary
    ) -> None:
        """Save paper summary to the paper's ``summary.json``."""
        paths = self.get_paper_paths(paper_id, source_type)
        summary.save_to_file(paths["summary_json"])

    def list_all_papers(self) -> Iterator[PaperMetadata]:
        """Iterate over all papers in the library.

        Every ``meta.json`` below the papers directory is loaded; files that
        cannot be parsed into a ``PaperMetadata`` are skipped.
        """
        papers_dir = self.library_paths.papers_dir
        if not papers_dir.exists():
            return

        for meta_file in papers_dir.rglob("meta.json"):
            try:
                yield PaperMetadata.load_from_file(meta_file)
            except (FileNotFoundError, ValueError, TypeError):
                # Corrupted or schema-mismatched metadata: skip, keep going.
                continue

    def paper_exists(self, paper_id: str, source_type: SourceType) -> bool:
        """Check if a paper already exists in storage (its ``meta.json`` is present)."""
        return self.get_paper_paths(paper_id, source_type)["meta"].exists()

    def delete_paper(self, paper_id: str, source_type: SourceType) -> bool:
        """Delete a paper and all its files.

        Returns:
            False when the paper directory does not exist, True once it has
            been removed.
        """
        paper_dir = self.get_paper_paths(paper_id, source_type)["directory"]
        if not paper_dir.exists():
            return False

        shutil.rmtree(paper_dir)
        return True
"sha256:08b310f9e24a9594186fd75b4f73f4a4152069e3853f1ed8bfbf58369f4ad708", size = 114353, upload-time = "2026-03-24T12:59:08.246Z" }, ] +[[package]] +name = "arxiv" +version = "3.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "feedparser" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ff/78/1e93a001ed51b5114e1978247078fa3130cbb2794a520603949cbe9a7028/arxiv-3.0.0.tar.gz", hash = "sha256:c8cb0d31208afbc1ceb17bd3f9816c8d4c5ca1e0abf199d211e216715440498d", size = 67344, upload-time = "2026-04-12T22:48:59.623Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9d/0d/bb2ef604e5548ba73ba6326576908d8285ebf3468b02b86af83381c7c973/arxiv-3.0.0-py3-none-any.whl", hash = "sha256:8b4d4e2e336bfeb71ea653623d7dadb260f682f0475cee2aecad0560a23b34db", size = 11928, upload-time = "2026-04-12T22:48:58.44Z" }, +] + [[package]] name = "audioop-lts" version = "0.2.2" @@ -502,6 +515,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/31/fb/6d251f3fdfe3346ee60d091f55106513e509659ee005ad39c914182c96f4/fasttext_predict-0.9.2.4-cp313-cp313t-win_amd64.whl", hash = "sha256:be0933fa4af7abae09c703d28f9e17c80e7069eb6f92100b21985b777f4ea275", size = 110325, upload-time = "2024-11-23T17:24:16.984Z" }, ] +[[package]] +name = "feedparser" +version = "6.0.12" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "sgmllib3k" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/dc/79/db7edb5e77d6dfbc54d7d9df72828be4318275b2e580549ff45a962f6461/feedparser-6.0.12.tar.gz", hash = "sha256:64f76ce90ae3e8ef5d1ede0f8d3b50ce26bcce71dd8ae5e82b1cd2d4a5f94228", size = 286579, upload-time = "2025-09-10T13:33:59.486Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4e/eb/c96d64137e29ae17d83ad2552470bafe3a7a915e85434d9942077d7fd011/feedparser-6.0.12-py3-none-any.whl", hash = "sha256:6bbff10f5a52662c00a2e3f86a38928c37c48f77b3c511aedcd51de933549324", size = 
81480, upload-time = "2025-09-10T13:33:58.022Z" }, +] + [[package]] name = "ffmpy" version = "1.0.0" @@ -1382,6 +1407,7 @@ name = "paperlib" version = "0.1.0" source = { editable = "." } dependencies = [ + { name = "arxiv" }, { name = "mineru", extra = ["core"] }, { name = "rich" }, { name = "typer" }, @@ -1389,6 +1415,7 @@ dependencies = [ [package.metadata] requires-dist = [ + { name = "arxiv", specifier = ">=2.0.0" }, { name = "mineru", extras = ["core"], specifier = ">=3.0.9" }, { name = "rich", specifier = ">=15.0.0" }, { name = "typer", specifier = ">=0.24.1" }, @@ -1947,6 +1974,12 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e1/e3/c164c88b2e5ce7b24d667b9bd83589cf4f3520d97cad01534cd3c4f55fdb/setuptools-81.0.0-py3-none-any.whl", hash = "sha256:fdd925d5c5d9f62e4b74b30d6dd7828ce236fd6ed998a08d81de62ce5a6310d6", size = 1062021, upload-time = "2026-02-06T21:10:37.175Z" }, ] +[[package]] +name = "sgmllib3k" +version = "1.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/9e/bd/3704a8c3e0942d711c1299ebf7b9091930adae6675d7c8f476a7ce48653c/sgmllib3k-1.0.0.tar.gz", hash = "sha256:7868fb1c8bfa764c1ac563d3cf369c381d1325d36124933a726f29fcdaa812e9", size = 5750, upload-time = "2010-08-24T14:33:52.445Z" } + [[package]] name = "shapely" version = "2.1.2"