diff --git a/README.md b/README.md index f90ba59..a362659 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,213 @@ -# `paperlib`: a CLI tool to manage paper library +# paperlib -This project use `mineru` to convert PDF to markdown, and establish a markdown paper library. +A local-first paper library engine with a CLI for managing academic papers. -## usage +**paperlib** is designed to import PDF papers into a structured local library, convert PDFs into Markdown using external converters, maintain stable per-paper metadata files, and provide a searchable index database. It offers optional AI-based structured summaries while remaining useful even without AI features. + +## Key Features + +- **Local-first**: All data lives locally in the paper library directory +- **CLI-first**: All important workflows accessible from the command line +- **JSON source of truth**: Per-paper metadata files with rebuildable SQLite index +- **AI-optional**: Core workflows work without LLM configuration +- **Machine-readable**: `--json` output for automation and integration +- **Stable interfaces**: Designed for scripts and higher-level tools + +## Installation ```bash -# init a library in current directory +# Install with uv (recommended) +uv add paperlib + +# Or with pip +pip install paperlib +``` + +## Quick Start + +```bash +# Initialize a paper library paperlib init -# manually import a PDF -paperlib import --pdf [--arxiv-id xxxx.xxxxx] +# Import a local PDF +paperlib import --pdf paper.pdf --title "My Research Paper" -# import an arXiv paper -paperlib import --arxiv xxxx.xxxxx +# Import from arXiv +paperlib import --arxiv 2212.06340 -# place holder -... 
+# List all papers +paperlib list + +# Show paper details +paperlib show <paper_id> + +# Convert PDFs to Markdown (requires MinerU) +paperlib convert + +# Search papers (planned; not yet implemented) +paperlib search "machine learning" + +# Rebuild search index +paperlib reindex ``` + +## Core Commands + +### Library Management +- `paperlib init [path]` - Initialize a paper library directory +- `paperlib status` - Show library configuration and layout +- `paperlib reindex` - Rebuild search index from stored papers + +### Paper Import +- `paperlib import --pdf <path>` - Import a local PDF file +- `paperlib import --arxiv <id>` - Import paper from arXiv +- Options: `--title`, `--notes`, `--tags`, `--library` + +### Paper Management +- `paperlib list` - List all imported papers with status +- `paperlib show <paper_id>` - Show detailed paper information +- `paperlib convert` - Convert pending papers to Markdown using MinerU + +### Search (Future) +- `paperlib search <query>` - Search papers by content and metadata + +## Library Structure + +A paperlib library is organized as follows: + +``` +library_root/ +├── config/ +│ ├── config.toml +│ └── prompts/ +├── papers/ +│ ├── arxiv/ +│ │ └── 2022/ +│ │ └── arxiv-2212_06340/ +│ │ ├── meta.json # Paper metadata +│ │ ├── source.pdf # Original PDF +│ │ ├── paper.md # Converted markdown +│ │ ├── summary.json # AI summary (optional) +│ │ ├── summary.md # Rendered summary +│ │ ├── assets/ # Images, figures +│ │ └── logs/ # Conversion logs +│ └── local/ +│ └── <hash_prefix>/ +│ └── ... 
+├── db/ +│ └── paperlib.sqlite3 # Search index (rebuildable) +├── inbox/ # Temporary imports +└── cache/ # Processing cache +``` + +## Data Model + +### Paper Metadata (`meta.json`) +Each paper has a `meta.json` file containing: +- Core identifiers: `paper_id`, `source_type`, `source_id` +- Bibliographic info: `title`, `authors`, `published_date`, `categories` +- File paths: `pdf_path`, `paper_md_path`, `summary_json_path` +- Processing status: `conversion_status`, `summary_status` +- User data: `tags`, `notes` + +### Summary Data (`summary.json`) +Optional AI-generated summaries with: +- Structured fields: problem statement, method overview, results +- Categorization: problem tags, technique tags +- Relevance scoring and recommended sections + +## PDF Conversion + +paperlib integrates with [MinerU](https://github.com/opendatalab/MinerU) for high-quality PDF to Markdown conversion: + +```bash +# Install MinerU (optional) +pip install mineru[core] + +# Convert all pending papers +paperlib convert + +# Convert specific paper +paperlib convert --paper-id <paper_id> +``` + +## Machine-Readable Output + +Most commands support `--json` output for automation: + +```bash +paperlib list --json +paperlib show <paper_id> --json +paperlib status --json +``` + +## Development + +paperlib is designed for extensibility and integration with higher-level tools. 
+ +### Running Tests + +```bash +# Run all tests +uv run pytest + +# Run specific test module +uv run pytest tests/test_models.py + +# Run with coverage +uv run pytest --cov=paperlib +``` + +### Code Quality + +```bash +# Format code +uv run ruff format + +# Check linting +uv run ruff check + +# Type checking +uv run mypy src/ +``` + +## Architecture + +paperlib follows clean architecture principles: + +- **Models**: Data structures for papers and summaries +- **Storage**: File-based metadata and PDF management +- **Index**: SQLite search and retrieval layer +- **Importers**: PDF and arXiv import workflows +- **Converters**: PDF to Markdown transformation +- **CLI**: Command-line interface and argument parsing + +## Roadmap + +- [x] Core paper import (local PDF, arXiv) +- [x] PDF to Markdown conversion (MinerU integration) +- [x] Metadata management and search indexing +- [x] CLI with all basic commands +- [x] Comprehensive test suite +- [ ] Search command implementation +- [ ] AI summarization with provider abstraction +- [ ] JSON output for all commands +- [ ] Configuration file support +- [ ] Advanced arXiv workflows + +## Non-Goals + +paperlib is intentionally focused and does NOT include: +- Web UI or GUI applications +- Multi-user or cloud-first features +- Mandatory daemon or background services +- Vector database requirements +- Fully autonomous research assistant behavior + +## License + +MIT License - see LICENSE file for details. + +## Contributing + +Contributions welcome! Please read the development guidelines in AGENTS.md and ensure all tests pass before submitting PRs. \ No newline at end of file diff --git a/docs/cli.md b/docs/cli.md new file mode 100644 index 0000000..76e9ab3 --- /dev/null +++ b/docs/cli.md @@ -0,0 +1,288 @@ +# CLI Reference + +This document describes all available commands in the paperlib CLI. 
+ +## Global Options + +All commands support these global options: + +- `--help`, `-h`: Show help message +- `--version`: Show version information + +Many commands also support: +- `--library`, `-L`: Specify library root directory (default: current directory) +- `--json`: Output machine-readable JSON instead of human-readable format + +## Commands + +### `paperlib init [PATH]` + +Initialize a paper library directory structure. + +**Arguments:** +- `PATH`: Directory to initialize (default: current directory) + +**Examples:** +```bash +# Initialize library in current directory +paperlib init + +# Initialize library in specific directory +paperlib init /path/to/my/papers + +# Initialize and create parent directories +paperlib init ~/Documents/research/papers +``` + +**Behavior:** +- Creates standard directory structure (config/, papers/, db/, etc.) +- Safe to run multiple times (idempotent) +- Creates parent directories if they don't exist + +--- + +### `paperlib import` + +Import papers into the library from various sources. 
+ +**Required (one of):** +- `--pdf PATH`: Import a local PDF file +- `--arxiv ID`: Import paper from arXiv by ID or URL + +**Options:** +- `--title TEXT`: Override paper title (for local PDFs) +- `--notes TEXT`: Add notes about the paper +- `--tags TAG1 TAG2`: Add tags to the paper +- `--library PATH`: Specify library directory + +**Examples:** +```bash +# Import local PDF +paperlib import --pdf paper.pdf --title "My Research" --tags ml ai + +# Import from arXiv +paperlib import --arxiv 2212.06340 + +# Import with arXiv URL +paperlib import --arxiv https://arxiv.org/abs/2212.06340 + +# Import to specific library +paperlib import --pdf paper.pdf --library ~/research +``` + +**Behavior:** +- Generates stable paper ID based on content (local) or arXiv ID +- Copies PDF to structured storage location +- Creates meta.json with paper metadata +- Prevents duplicate imports (same content/ID) +- Indexes paper in search database + +--- + +### `paperlib list` + +List all papers in the library with their current status. + +**Options:** +- `--library PATH`: Specify library directory +- `--json`: Output in JSON format + +**Examples:** +```bash +# List all papers +paperlib list + +# List papers in specific library +paperlib list --library ~/research + +# Get machine-readable output +paperlib list --json +``` + +**Output Format:** +``` +Found 3 papers: + +📄 arxiv-2212_06340 + The new discontinuous Galerkin methods based numerical relativity program Nmesh + By: Wolfgang Tichy, Liwei Ji, Ananya Adhikari (+2 more) + Categories: gr-qc + +⏳ local-a1b2c3d4e5f6 + Machine Learning Applications in Physics + Categories: cs.AI, physics.comp-ph +``` + +**Status Indicators:** +- ⏳ Paper imported, conversion pending +- 📄 PDF converted to Markdown +- 📝 AI summary generated +- ❌ Conversion or processing failed + +--- + +### `paperlib show PAPER_ID` + +Show detailed information about a specific paper. 
+ +**Arguments:** +- `PAPER_ID`: The unique paper identifier + +**Options:** +- `--library PATH`: Specify library directory +- `--json`: Output in JSON format + +**Examples:** +```bash +# Show paper details +paperlib show arxiv-2212_06340 + +# Show with JSON output +paperlib show local-a1b2c3d4 --json +``` + +**Output includes:** +- All metadata fields +- Processing status +- File locations and existence +- Import timestamp +- Tags and notes + +--- + +### `paperlib convert` + +Convert papers from PDF to Markdown using MinerU. + +**Options:** +- `--library PATH`: Specify library directory +- `--paper-id ID`: Convert specific paper only + +**Examples:** +```bash +# Convert all pending papers +paperlib convert + +# Convert specific paper +paperlib convert --paper-id arxiv-2212_06340 + +# Convert in specific library +paperlib convert --library ~/research +``` + +**Behavior:** +- Processes papers with `conversion_status: pending` +- Uses MinerU for PDF to Markdown conversion +- Updates metadata with conversion status +- Creates conversion logs in `logs/` directory +- Handles conversion failures gracefully + +--- + +### `paperlib reindex` + +Rebuild the search index from stored paper metadata. + +**Options:** +- `--library PATH`: Specify library directory + +**Examples:** +```bash +# Rebuild index +paperlib reindex + +# Rebuild index for specific library +paperlib reindex --library ~/research +``` + +**Behavior:** +- Clears existing SQLite database +- Scans all meta.json files in papers/ directory +- Rebuilds full-text search index +- Reports statistics on completion +- Safe to run anytime (repairs corrupted index) + +--- + +### `paperlib status` + +Show library configuration and layout information. 
+ +**Options:** +- `--library PATH`: Specify library directory +- `--json`: Output in JSON format + +**Examples:** +```bash +# Show current library status +paperlib status + +# Show specific library status +paperlib status --library ~/research +``` + +**Output:** +``` +root: /home/user/papers +config: /home/user/papers/config/config.toml +database: /home/user/papers/db/paperlib.sqlite3 +papers: /home/user/papers/papers +inbox: /home/user/papers/inbox +cache: /home/user/papers/cache +``` + +--- + +## Future Commands + +These commands are planned but not yet implemented: + +### `paperlib search QUERY` +Search papers by content and metadata. + +### `paperlib summarize [PAPER_ID]` +Generate AI summaries for papers. + +### `paperlib export FORMAT` +Export papers in various formats. + +### `paperlib doctor` +Diagnose and repair library issues. + +--- + +## Exit Codes + +paperlib commands return standard exit codes: + +- `0`: Success +- `1`: General error (file not found, invalid arguments, etc.) +- `2`: Command line argument error + +## Configuration + +paperlib looks for configuration in these locations (in order): +1. `$LIBRARY_ROOT/config/config.toml` +2. `~/.config/paperlib/config.toml` +3. Built-in defaults + +## JSON Output Format + +When using `--json`, commands output structured data suitable for programmatic consumption: + +```json +{ + "papers": [ + { + "paper_id": "arxiv-2212_06340", + "title": "Example Paper", + "authors": ["Alice Smith", "Bob Jones"], + "conversion_status": "success", + "imported_at": "2024-01-15T10:30:00" + } + ], + "total": 1 +} +``` + +This format is stable across paperlib versions for reliable automation. \ No newline at end of file diff --git a/docs/integration-guide.md b/docs/integration-guide.md new file mode 100644 index 0000000..2555997 --- /dev/null +++ b/docs/integration-guide.md @@ -0,0 +1,638 @@ +# Integration Guide + +This document describes how to integrate paperlib with higher-level tools and automation workflows. 
+ +## Overview + +paperlib is designed as a **library engine** that higher-level tools can build upon. It provides: + +- **Stable CLI interface** with machine-readable JSON output +- **File-based storage** that external tools can read directly +- **Python API** for programmatic access +- **Event hooks** for workflow integration (future) + +## CLI Integration + +### Machine-Readable Output + +Most paperlib commands support `--json` output for automation: + +```bash +# Get library statistics +paperlib status --json +{ + "library_root": "/home/user/papers", + "total_papers": 42, + "by_status": {"converted": 38, "pending": 4}, + "last_updated": "2024-01-15T10:30:00Z" +} + +# List papers with metadata +paperlib list --json +{ + "papers": [ + { + "paper_id": "arxiv-2212_06340", + "title": "Example Paper", + "authors": ["Alice Smith", "Bob Jones"], + "categories": ["cs.AI"], + "conversion_status": "success", + "summary_status": "pending", + "imported_at": "2024-01-15T10:30:00Z" + } + ], + "total": 1 +} + +# Import with JSON response +paperlib import --arxiv 2212.06340 --json +{ + "success": true, + "paper_id": "arxiv-2212_06340", + "title": "Example Paper Title", + "message": "Successfully imported arXiv paper" +} +``` + +### Exit Codes + +paperlib commands follow standard Unix exit code conventions: + +```bash +paperlib import --arxiv 2212.06340 +echo $? # 0 for success, 1 for error + +# Check if paper exists before processing +if paperlib show "$paper_id" --json >/dev/null 2>&1; then + echo "Paper exists" +else + echo "Paper not found" +fi +``` + +### Scripting Examples + +#### Daily arXiv Import + +```bash +#!/bin/bash +# daily-arxiv.sh - Import papers from daily arXiv feed + +LIBRARY="$HOME/research" +ARXIV_FEED_URL="http://export.arxiv.org/rss/cs.AI" + +# Parse RSS feed and extract arXiv IDs +curl -s "$ARXIV_FEED_URL" | \ +grep -oP 'arxiv\.org/abs/\K[0-9]{4}\.[0-9]{4,5}' | \ +while read arxiv_id; do + echo "Importing $arxiv_id..." 
+ paperlib import --arxiv "$arxiv_id" --library "$LIBRARY" --json +done + +# Convert newly imported papers +paperlib convert --library "$LIBRARY" + +# Generate daily report +paperlib list --library "$LIBRARY" --json | \ +jq '.papers | map(select(.imported_at | startswith(now | strftime("%Y-%m-%d"))))' +``` + +#### Batch Processing + +```bash +#!/bin/bash +# batch-process.sh - Process multiple papers from a list + +LIBRARY="$HOME/research" +PAPER_LIST="papers.txt" + +while IFS= read -r pdf_path; do + if [[ -f "$pdf_path" ]]; then + echo "Importing $pdf_path..." + result=$(paperlib import --pdf "$pdf_path" --library "$LIBRARY" --json) + + if [[ $? -eq 0 ]]; then + paper_id=$(echo "$result" | jq -r '.paper_id') + echo "Successfully imported as $paper_id" + else + echo "Failed to import $pdf_path" + fi + fi +done < "$PAPER_LIST" + +# Convert all pending papers +paperlib convert --library "$LIBRARY" +``` + +## Python API + +### Direct Library Access + +```python +from paperlib.config import LibraryPaths +from paperlib.storage import PaperStorageManager +from paperlib.index import DatabaseManager +from paperlib.importer import ArxivImporter, LocalImporter + +# Initialize library components +library_paths = LibraryPaths.from_root("/path/to/library") +storage = PaperStorageManager(library_paths) +database = DatabaseManager(library_paths) +database.initialize_database() + +# Import paper programmatically +arxiv_importer = ArxivImporter(storage) +metadata = arxiv_importer.import_arxiv_paper("2212.06340") +database.index_paper(metadata) + +# Search and retrieve +results = list(database.search_papers("neural networks")) +for result in results: + paper = storage.load_paper_metadata(result["paper_id"], result["source_type"]) + print(f"{paper.title} by {', '.join(paper.authors)}") + +# Get statistics +stats = database.get_statistics() +print(f"Total papers: {stats['total_papers']}") +``` + +### Metadata Processing + +```python +import json +from pathlib import Path +from 
paperlib.models import PaperMetadata, PaperSummary + +# Process all papers in library +papers_dir = Path("/home/user/papers/papers") + +for meta_file in papers_dir.rglob("meta.json"): + # Load metadata + metadata = PaperMetadata.load_from_file(meta_file) + + # Check for summary + summary_path = meta_file.parent / "summary.json" + if summary_path.exists(): + summary = PaperSummary.load_from_file(summary_path) + + # Extract key information + tags = summary.problem_tags + summary.technique_tags + entities = summary.entities + + print(f"Paper: {metadata.title}") + print(f"Tags: {', '.join(tags)}") + print(f"Entities: {', '.join(entities)}") +``` + +## File System Integration + +### Direct File Access + +Since paperlib uses a documented file layout, tools can read data directly: + +```python +import json +from pathlib import Path + +def scan_library(library_root: Path): + """Scan library and extract metadata.""" + papers = [] + + for meta_file in library_root.glob("papers/**/meta.json"): + with meta_file.open() as f: + metadata = json.load(f) + papers.append(metadata) + + return papers + +def find_papers_by_category(library_root: Path, category: str): + """Find papers in a specific category.""" + matching_papers = [] + + for meta_file in library_root.glob("papers/**/meta.json"): + with meta_file.open() as f: + metadata = json.load(f) + + if category in metadata.get("categories", []): + matching_papers.append(metadata) + + return matching_papers +``` + +### Watch for Changes + +```python +import json +import time +from pathlib import Path +from watchdog.observers import Observer +from watchdog.events import FileSystemEventHandler + +class PaperLibraryHandler(FileSystemEventHandler): + def __init__(self, library_root): + self.library_root = Path(library_root) + + def on_created(self, event): + if event.src_path.endswith("meta.json"): + print(f"New paper imported: {event.src_path}") + # Trigger processing workflow + self.process_new_paper(event.src_path) + + def on_modified(self, 
event): + if event.src_path.endswith("summary.json"): + print(f"Summary updated: {event.src_path}") + # Update downstream systems + + def process_new_paper(self, meta_path): + """Handle newly imported paper.""" + # Load metadata + with open(meta_path) as f: + metadata = json.load(f) + + # Trigger downstream processing + # - Send to processing queue + # - Update knowledge base + # - Generate notifications + +# Watch library for changes +observer = Observer() +handler = PaperLibraryHandler("/home/user/papers") +observer.schedule(handler, "/home/user/papers/papers", recursive=True) +observer.start() +``` + +## Higher-Level Tool Examples + +### Research Dashboard + +```python +"""research_dashboard.py - Web dashboard for research library""" + +from flask import Flask, jsonify, render_template +from paperlib.config import LibraryPaths +from paperlib.storage import PaperStorageManager +from paperlib.index import DatabaseManager + +app = Flask(__name__) + +# Initialize paperlib components +library_paths = LibraryPaths.from_root("/home/user/research") +storage = PaperStorageManager(library_paths) +database = DatabaseManager(library_paths) + +@app.route('/api/papers') +def list_papers(): + """List all papers with metadata.""" + papers = list(database.list_papers(limit=50)) + return jsonify(papers) + +@app.route('/api/search/') +def search_papers(query): + """Search papers by query.""" + results = list(database.search_papers(query, limit=20)) + return jsonify(results) + +@app.route('/api/stats') +def library_stats(): + """Get library statistics.""" + stats = database.get_statistics() + return jsonify(stats) + +@app.route('/') +def dashboard(): + """Main dashboard page.""" + return render_template('dashboard.html') + +if __name__ == '__main__': + app.run(debug=True) +``` + +### Daily Digest Generator + +```python +"""daily_digest.py - Generate daily research digest""" + +import json +from datetime import datetime, timedelta +from pathlib import Path +from paperlib.config 
import LibraryPaths +from paperlib.index import DatabaseManager + +def generate_daily_digest(library_root: str, output_file: str): + """Generate digest of recently imported papers.""" + + # Initialize database + library_paths = LibraryPaths.from_root(library_root) + database = DatabaseManager(library_paths) + + # Get papers from last 24 hours + yesterday = datetime.now() - timedelta(days=1) + yesterday_iso = yesterday.isoformat() + + recent_papers = [] + for paper in database.list_papers(): + if paper["imported_at"] >= yesterday_iso: + recent_papers.append(paper) + + if not recent_papers: + print("No new papers imported yesterday.") + return + + # Group by category + by_category = {} + for paper in recent_papers: + categories = json.loads(paper["categories_json"]) + for category in categories: + if category not in by_category: + by_category[category] = [] + by_category[category].append(paper) + + # Generate HTML digest + html_content = f""" + <html> + <head><title>Daily Research Digest - {datetime.now().strftime('%Y-%m-%d')}</title></head> + <body> + <h1>Daily Research Digest</h1> + <p>Found {len(recent_papers)} new papers</p> + """ + + for category, papers in by_category.items(): + html_content += f"<h2>{category}</h2><ul>" + for paper in papers: + title = paper["title"] + paper_id = paper["paper_id"] + html_content += f'<li>{title} ({paper_id})</li>' + html_content += "</ul>" + + html_content += "</body></html>" + + # Write output + Path(output_file).write_text(html_content) + print(f"Digest written to {output_file}") + +if __name__ == "__main__": + generate_daily_digest("/home/user/research", "digest.html") +``` + +### Literature Review Assistant + +```python +"""review_assistant.py - AI-powered literature review helper""" + +import json +from pathlib import Path + +from paperlib.config import LibraryPaths +from paperlib.index import DatabaseManager +from paperlib.models import PaperSummary + +class ReviewAssistant: + def __init__(self, library_root: str): + self.library_paths = LibraryPaths.from_root(library_root) + self.database = DatabaseManager(self.library_paths) + + def find_related_papers(self, paper_id: str, max_results: int = 10): + """Find papers related to the given paper.""" + + # Get source paper metadata + source_paper = self.database.get_paper(paper_id) + if not source_paper: + return [] + + # Extract search terms from title and categories + title_words = source_paper["title"].lower().split() + categories = json.loads(source_paper["categories_json"]) + + # Search for papers with similar keywords + search_terms = title_words + categories + related_papers = [] + + for term in search_terms: + results = list(self.database.search_papers(term, limit=5)) + for result in results: + if result["paper_id"] != paper_id: + related_papers.append(result) + + # Remove duplicates and return top results + seen_ids = set() + unique_papers = [] + for paper in related_papers: + if paper["paper_id"] not in seen_ids: + seen_ids.add(paper["paper_id"]) + unique_papers.append(paper) + if len(unique_papers) >= max_results: + break + + return unique_papers + + def generate_topic_overview(self, topic: str): + """Generate overview of papers on a specific topic.""" + + # Search for papers on topic + papers = list(self.database.search_papers(topic, limit=50)) + + if not papers: + return f"No papers found for topic: {topic}" + + # Analyze summaries if available + key_entities = set() + techniques = set() 
+ + for paper in papers: + summary_path = Path(paper["summary_json_path"]) + if summary_path.exists(): + summary = PaperSummary.load_from_file(summary_path) + key_entities.update(summary.entities) + techniques.update(summary.technique_tags) + + # Generate overview + overview = f""" + Topic: {topic} + + Papers found: {len(papers)} + + Key entities mentioned: + {', '.join(sorted(key_entities)[:10])} + + Common techniques: + {', '.join(sorted(techniques)[:10])} + + Recent papers: + """ + + # Add recent papers + recent_papers = sorted(papers, key=lambda x: x["imported_at"], reverse=True)[:5] + for paper in recent_papers: + overview += f"\n- {paper['title']} ({paper['paper_id']})" + + return overview + +# Usage +assistant = ReviewAssistant("/home/user/research") +overview = assistant.generate_topic_overview("transformer architecture") +print(overview) +``` + +## Integration Patterns + +### Pipeline Processing + +```bash +# Multi-stage processing pipeline +paperlib import --arxiv 2212.06340 --json > import_result.json +paper_id=$(jq -r '.paper_id' import_result.json) + +# Convert to markdown +paperlib convert --paper-id "$paper_id" + +# Generate summary (when available) +# paperlib summarize --paper-id "$paper_id" + +# Update downstream systems +curl -X POST "http://research-db/api/papers" \ + -H "Content-Type: application/json" \ + -d @import_result.json +``` + +### Event-Driven Architecture + +```python +"""event_handler.py - Process paperlib events""" + +import json +from pathlib import Path +import pika # RabbitMQ client + +class PaperLibraryEventHandler: + def __init__(self, rabbitmq_url: str): + self.connection = pika.BlockingConnection(pika.URLParameters(rabbitmq_url)) + self.channel = self.connection.channel() + + def on_paper_imported(self, paper_metadata: dict): + """Handle new paper import.""" + message = { + "event": "paper_imported", + "paper_id": paper_metadata["paper_id"], + "title": paper_metadata["title"], + "categories": paper_metadata["categories"], + 
"timestamp": paper_metadata["imported_at"] + } + + # Send to processing queue + self.channel.basic_publish( + exchange='', + routing_key='paper_processing', + body=json.dumps(message) + ) + + def on_summary_generated(self, paper_id: str, summary_path: Path): + """Handle summary generation.""" + with summary_path.open() as f: + summary = json.load(f) + + message = { + "event": "summary_generated", + "paper_id": paper_id, + "tags": summary["problem_tags"] + summary["technique_tags"], + "entities": summary["entities"] + } + + # Send to indexing service + self.channel.basic_publish( + exchange='', + routing_key='summary_indexing', + body=json.dumps(message) + ) +``` + +## Best Practices + +### Error Handling + +```python +import subprocess +import json + +def safe_paperlib_command(command: list[str]) -> dict: + """Execute paperlib command with proper error handling.""" + try: + result = subprocess.run( + ["paperlib"] + command + ["--json"], + capture_output=True, + text=True, + check=True + ) + return json.loads(result.stdout) + + except subprocess.CalledProcessError as e: + return { + "success": False, + "error": e.stderr, + "exit_code": e.returncode + } + + except json.JSONDecodeError as e: + return { + "success": False, + "error": f"Invalid JSON response: {e}", + "raw_output": result.stdout + } + +# Usage +result = safe_paperlib_command(["import", "--arxiv", "2212.06340"]) +if result.get("success", True): # Assume success if no "success" field + print(f"Imported paper: {result['paper_id']}") +else: + print(f"Import failed: {result['error']}") +``` + +### Performance Optimization + +```python +# Batch operations for better performance +from paperlib.index import DatabaseManager + +def batch_index_papers(library_root: str, paper_ids: list[str]): + """Index multiple papers efficiently.""" + database = DatabaseManager(LibraryPaths.from_root(library_root)) + storage = PaperStorageManager(LibraryPaths.from_root(library_root)) + + # Begin transaction for batch insert + 
with database._get_connection() as conn: + for paper_id in paper_ids: + metadata = storage.load_paper_metadata(paper_id, source_type) + if metadata: + database.index_paper(metadata) + # Automatic commit on context exit +``` + +### Configuration Management + +```python +# config_manager.py - Centralized configuration +import os +from pathlib import Path + +class ConfigManager: + def __init__(self): + self.library_root = os.getenv("PAPERLIB_ROOT", Path.home() / "research") + self.api_keys = { + "openai": os.getenv("OPENAI_API_KEY"), + "anthropic": os.getenv("ANTHROPIC_API_KEY") + } + + def get_library_path(self, name: str = "default") -> str: + """Get library path by name.""" + if name == "default": + return str(self.library_root) + return str(Path.home() / f"research-{name}") + + def paperlib_command_base(self, library_name: str = "default") -> list[str]: + """Get base command for paperlib with library.""" + return ["paperlib", "--library", self.get_library_path(library_name)] + +config = ConfigManager() + +# Usage in scripts +import subprocess +cmd = config.paperlib_command_base("arxiv") + ["list", "--json"] +result = subprocess.run(cmd, capture_output=True, text=True) +``` + +This integration guide provides the foundation for building sophisticated research workflows on top of paperlib's stable, local-first architecture. \ No newline at end of file diff --git a/docs/storage-layout.md b/docs/storage-layout.md new file mode 100644 index 0000000..eaeb024 --- /dev/null +++ b/docs/storage-layout.md @@ -0,0 +1,259 @@ +# Storage Layout + +This document describes the on-disk structure and organization of a paperlib library. + +## Overview + +A paperlib library is a directory containing all papers, metadata, configuration, and index data. 
The layout is designed to be: + +- **Human-readable**: Directory structure is intuitive and browsable +- **Stable**: File locations don't change unexpectedly +- **Rebuildable**: Index can be reconstructed from source files +- **Portable**: Entire library can be moved or backed up as a unit + +## Directory Structure + +``` +library_root/ +├── config/ # Library configuration +│ ├── config.toml # Main configuration file +│ ├── vocab.yaml # Controlled vocabulary (future) +│ └── prompts/ # AI prompt templates (future) +│ └── summarize_paper.md +├── papers/ # Paper storage (source of truth) +│ ├── arxiv/ # arXiv papers organized by year +│ │ └── 2022/ +│ │ └── arxiv-2212_06340/ +│ │ ├── meta.json # Paper metadata +│ │ ├── source.pdf # Original PDF +│ │ ├── paper.md # Converted markdown +│ │ ├── summary.json # AI-generated summary +│ │ ├── summary.md # Rendered summary +│ │ ├── ref.bib # Bibliography (future) +│ │ ├── assets/ # Images, figures +│ │ └── logs/ # Processing logs +│ │ └── mineru.log +│ └── local/ # Local PDF imports by hash +│ └── a1b2c3d4e5f6/ +│ └── ... (same structure) +├── inbox/ # Temporary import staging (future) +├── db/ # Search index (rebuildable) +│ └── paperlib.sqlite3 +└── cache/ # Processing cache (safe to delete) +``` + +## Paper Directory Organization + +### arXiv Papers + +arXiv papers are organized by year and paper ID: + +``` +papers/arxiv/YEAR/arxiv-NORMALIZED_ID/ +``` + +Where: +- `YEAR` is extracted from the arXiv ID (e.g., `2212.06340` → `2022`) +- `NORMALIZED_ID` is the arXiv ID with dots replaced by underscores; a version suffix, if present, is kept + - `2212.06340` → `2212_06340` (directory `arxiv-2212_06340`) + - `2212.06340v2` → `2212_06340v2` (directory `arxiv-2212_06340v2`) + +**Examples:** +``` +papers/arxiv/2022/arxiv-2212_06340/ +papers/arxiv/2023/arxiv-2301_12345v1/ +papers/arxiv/2024/arxiv-2405_98765/ +``` + +### Local Papers + +Local papers are organized by content hash: + +``` +papers/local/HASH_PREFIX/ +``` + +Where `HASH_PREFIX` is the first 16 characters of the SHA256 hash of the PDF file. 
+ +**Examples:** +``` +papers/local/a1b2c3d4e5f67890/ +papers/local/fedcba9876543210/ +``` + +## File Types + +### Required Files + +Every paper directory contains: + +#### `meta.json` +The canonical metadata file (JSON format): +```json +{ + "paper_id": "arxiv-2212_06340", + "source_type": "arxiv", + "source_id": "2212.06340", + "title": "Example Paper Title", + "authors": ["Alice Smith", "Bob Jones"], + "published_date": "2022-12-13T02:46:55", + "categories": ["cs.AI", "stat.ML"], + "pdf_path": "papers/arxiv/2022/arxiv-2212_06340/source.pdf", + "paper_md_path": "papers/arxiv/2022/arxiv-2212_06340/paper.md", + "imported_at": "2024-01-15T10:30:00", + "conversion_status": "success", + "summary_status": "not_requested", + "tags": ["machine-learning"], + "notes": "Important paper on neural networks" +} +``` + +#### `source.pdf` +The original PDF file, exactly as imported. + +### Generated Files + +These files are created by paperlib processing: + +#### `paper.md` +Markdown conversion of the PDF, generated by MinerU or other converters. + +#### `summary.json` (optional) +AI-generated structured summary: +```json +{ + "schema_version": "1.0", + "one_sentence_summary": "This paper introduces...", + "problem_statement": "Current methods have limitations...", + "method_overview": "We propose a novel approach...", + "main_results": "Experiments show 95% accuracy...", + "claimed_contributions": ["Novel architecture", "Improved performance"], + "problem_tags": ["classification", "optimization"], + "technique_tags": ["neural-networks", "transformers"], + "entities": ["BERT", "ImageNet", "ResNet"], + "relevance_to_user": 0.85 +} +``` + +#### `summary.md` (optional) +Human-readable summary rendered from `summary.json`. + +### Supporting Directories + +#### `assets/` +Contains extracted images, figures, and other media from the PDF conversion process. 
+ +#### `logs/` +Processing logs for debugging and audit trails: +- `mineru.log` - PDF conversion logs +- `summary.log` - AI summarization logs (future) + +## Index Database + +The SQLite database at `db/paperlib.sqlite3` contains: + +### Tables + +#### `papers` +Main paper index with searchable fields: +- Metadata from all `meta.json` files +- Computed search fields (full-text, author lists, etc.) +- Processing status tracking + +#### `papers_fts` +Full-text search virtual table (SQLite FTS5) for content search. + +### Rebuilding + +The database is **always rebuildable** from the source files: +```bash +paperlib reindex +``` + +This design ensures the JSON files remain the authoritative source of truth. + +## Path Conventions + +### Relative Paths +All paths in `meta.json` are relative to the library root: +```json +{ + "pdf_path": "papers/local/a1b2c3d4e5f67890/source.pdf", + "paper_md_path": "papers/local/a1b2c3d4e5f67890/paper.md" +} +``` + +### Cross-Platform Compatibility +All paths use forward slashes (`/`) regardless of operating system. + +## Backup and Portability + +### What to Backup +For complete library backup, include: +- `config/` directory (configuration) +- `papers/` directory (source of truth) + +### What NOT to Backup +These can be regenerated: +- `db/` directory (rebuildable index) +- `cache/` directory (temporary files) + +### Moving Libraries +To move a library: +1. Copy the entire directory structure +2. Run `paperlib reindex` to rebuild the database +3. 
Update any absolute paths in configuration + +## Storage Efficiency + +### Deduplication +Papers are naturally deduplicated: +- arXiv papers by normalized arXiv ID +- Local papers by SHA256 content hash + +### Large Files +For papers with large asset directories: +- Assets are stored alongside papers for locality +- Consider using file system compression or deduplication if needed + +## File System Requirements + +### Permissions +paperlib requires: +- Read/write access to library directory +- Ability to create subdirectories +- Atomic file operations for metadata updates + +### File System Features +Recommended: +- Case-sensitive file system (avoids conflicts) +- Support for Unicode filenames +- Journaling (protects against corruption) + +### Disk Space +Typical storage requirements: +- PDF files: 1-10 MB each +- Markdown conversions: 10-100 KB each +- Metadata: ~1-5 KB per paper +- Database index: ~1-10 KB per paper +- Assets: Varies (0-50 MB for image-heavy papers) + +## Migration and Versioning + +### Schema Evolution +When paperlib updates its storage format: +- Metadata schema versions are tracked in each file +- Migration tools handle format upgrades +- Backward compatibility is maintained when possible + +### Validation +paperlib provides tools to validate library integrity: +```bash +paperlib doctor # (future command) +``` + +This will check: +- All referenced files exist +- Metadata format is valid +- Database consistency with files +- No orphaned or corrupted data \ No newline at end of file diff --git a/docs/summary-schema.md b/docs/summary-schema.md new file mode 100644 index 0000000..a1e6c01 --- /dev/null +++ b/docs/summary-schema.md @@ -0,0 +1,289 @@ +# Summary Schema + +This document defines the structure and semantics of the `summary.json` files that contain AI-generated paper summaries. + +## Overview + +The `summary.json` file contains structured, AI-generated analysis of a paper. 
It is designed to: + +- Provide consistent, machine-readable summaries +- Support research triage and discovery workflows +- Enable automated categorization and search +- Remain stable across different AI providers +- Use controlled vocabulary when available + +## Schema Version 1.0 + +### File Structure + +```json +{ + "schema_version": "1.0", + "one_sentence_summary": "This paper introduces a novel neural architecture for...", + "problem_statement": "Current approaches to X suffer from limitations...", + "method_overview": "The authors propose a hybrid approach combining...", + "main_results": "Experiments show 15% improvement over baselines...", + "claimed_contributions": [ + "Novel attention mechanism design", + "State-of-the-art results on ImageNet", + "Theoretical analysis of convergence properties" + ], + "assumptions": [ + "Data is independently distributed", + "Computational budget allows for large models" + ], + "limitations": [ + "Only evaluated on English text", + "Requires significant computational resources", + "Limited theoretical justification for design choices" + ], + "problem_tags": ["classification", "computer-vision", "optimization"], + "technique_tags": ["neural-networks", "attention", "transformers"], + "entities": ["ImageNet", "BERT", "ResNet", "CIFAR-10"], + "relevance_to_user": 0.75, + "recommended_sections": ["Section 3.2", "Algorithm 1", "Table 2"] +} +``` + +## Field Definitions + +### Required Fields + +#### `schema_version` (string) +- **Purpose**: Track format version for migration +- **Format**: Semantic version string (e.g., "1.0") +- **Required**: Yes + +#### `one_sentence_summary` (string) +- **Purpose**: Concise paper overview for quick scanning +- **Guidelines**: + - One complete sentence, under 200 characters + - Focus on the main contribution or finding + - Avoid technical jargon when possible +- **Example**: "This paper introduces a new attention mechanism that improves transformer efficiency by 40% while maintaining 
accuracy." + +### Core Content Fields + +#### `problem_statement` (string) +- **Purpose**: What problem does this paper address? +- **Guidelines**: + - 2-3 sentences maximum + - Focus on the gap or limitation being addressed + - Explain why this problem matters + +#### `method_overview` (string) +- **Purpose**: High-level description of the approach +- **Guidelines**: + - 3-4 sentences maximum + - Focus on the key innovation or insight + - Avoid detailed algorithmic descriptions + +#### `main_results` (string) +- **Purpose**: Key empirical findings or theoretical results +- **Guidelines**: + - Quantitative results when available + - Highlight significance of improvements + - Note any surprising or counterintuitive findings + +### Structured Lists + +#### `claimed_contributions` (array of strings) +- **Purpose**: Authors' stated contributions +- **Guidelines**: + - Extract from paper's contribution list + - Preserve authors' framing and claims + - 3-6 items typically + +#### `assumptions` (array of strings) +- **Purpose**: Key assumptions underlying the work +- **Guidelines**: + - Mathematical, methodological, or data assumptions + - Critical for understanding applicability + - Often unstated but important + +#### `limitations` (array of strings) +- **Purpose**: Acknowledged or apparent limitations +- **Guidelines**: + - From authors' discussion or limitations section + - Obvious limitations not acknowledged by authors + - Important for understanding scope + +### Categorization + +#### `problem_tags` (array of strings) +- **Purpose**: Categorize the problem domain +- **Controlled vocabulary** (preferred values): + - `classification`, `regression`, `clustering` + - `optimization`, `search`, `planning` + - `generation`, `translation`, `summarization` + - `detection`, `segmentation`, `tracking` + - `compression`, `encoding`, `decoding` + - `privacy`, `security`, `robustness` + - `interpretability`, `fairness`, `ethics` + - `efficiency`, `scalability`, `deployment` + 
+#### `technique_tags` (array of strings) +- **Purpose**: Categorize the technical approaches +- **Controlled vocabulary** (preferred values): + - `neural-networks`, `deep-learning`, `transformers` + - `cnn`, `rnn`, `lstm`, `gru`, `attention` + - `reinforcement-learning`, `supervised-learning`, `unsupervised-learning` + - `bayesian`, `probabilistic`, `statistical` + - `graph-neural-networks`, `graph-algorithms` + - `computer-vision`, `natural-language-processing` + - `federated-learning`, `transfer-learning`, `meta-learning` + - `adversarial`, `generative-models`, `vae`, `gan` + +### Entities and References + +#### `entities` (array of strings) +- **Purpose**: Important datasets, models, algorithms, or systems mentioned +- **Guidelines**: + - Proper names: "ImageNet", "BERT", "ResNet" + - Algorithms: "SGD", "Adam", "RANSAC" + - Benchmarks: "GLUE", "COCO", "WMT" + - Avoid generic terms like "neural network" + +### User Relevance + +#### `relevance_to_user` (number, optional) +- **Purpose**: Estimated relevance score for the user +- **Format**: Float between 0.0 and 1.0 +- **Guidelines**: + - Based on user's research interests (if known) + - `null` if user preferences unavailable + - Higher scores = more relevant + +#### `recommended_sections` (array of strings, optional) +- **Purpose**: Specific sections worth reading in detail +- **Format**: Section references as they appear in paper +- **Examples**: ["Section 3.2", "Algorithm 1", "Table 2", "Appendix A"] + +## Generation Guidelines + +### AI Provider Instructions + +When generating summaries, AI models should: + +1. **Read for understanding**: Focus on the paper's core contributions +2. **Use structured thinking**: Work through each field systematically +3. **Prefer facts over interpretation**: Extract what authors claim, not opinions +4. **Use controlled vocabulary**: Select from predefined tag lists when possible +5. **Be concise**: Optimize for quick scanning and search +6. 
**Handle uncertainty**: Use `null` or empty arrays for unclear fields + +### Quality Criteria + +Good summaries exhibit: +- **Accuracy**: Faithful to the paper's content +- **Completeness**: Cover all major aspects +- **Consistency**: Similar papers get similar treatment +- **Searchability**: Use terms that aid discovery +- **Brevity**: Information density over verbosity + +### Common Issues to Avoid + +- **Hallucination**: Never invent facts not in the paper +- **Editorializing**: Don't add opinions about paper quality +- **Inconsistent terminology**: Use standard field names +- **Over-abstraction**: Keep concrete details when useful +- **Under-specification**: Provide enough detail for usefulness + +## Schema Evolution + +### Version History + +- **v1.0** (current): Initial schema with core fields + +### Migration Strategy + +When the schema evolves: +1. New versions increment the `schema_version` field +2. Migration tools handle format upgrades automatically +3. Backward compatibility maintained when possible +4. Deprecated fields are marked but preserved + +### Extensibility + +Future versions may add: +- Additional structured fields +- Hierarchical tag taxonomies +- Multi-lingual support +- Citation relationship mapping +- Experimental reproducibility metadata + +## Integration with paperlib + +### File Lifecycle + +1. **Generation**: AI provider creates `summary.json` +2. **Validation**: paperlib validates against schema +3. **Indexing**: Content indexed for search +4. **Rendering**: Human-readable `summary.md` generated +5. 
**Updates**: Summaries can be regenerated with new models + +### Search Integration + +Summary fields are indexed for search: +- Full-text search includes all text fields +- Tag-based search uses `problem_tags` and `technique_tags` +- Entity search uses the `entities` field +- Relevance ranking can use `relevance_to_user` scores + +### API Integration + +Higher-level tools can consume summaries programmatically: + +```python +import json +from pathlib import Path + +# Load summary +summary_path = Path("papers/arxiv/2022/arxiv-2212_06340/summary.json") +with summary_path.open() as f: + summary = json.load(f) + +# Extract key information +tags = summary["problem_tags"] + summary["technique_tags"] +# `relevance_to_user` may be missing or explicitly null; treat both as 0.0 +relevance = summary.get("relevance_to_user") or 0.0 +entities = summary["entities"] +``` + +This enables automated workflows like: +- Daily digest generation +- Research recommendation systems +- Literature review automation +- Cross-reference discovery + +## Examples + +### Machine Learning Paper +```json +{ + "schema_version": "1.0", + "one_sentence_summary": "Introduces EfficientNet, a family of convolutional neural networks that achieve better accuracy and efficiency than previous models through compound scaling.", + "problem_statement": "Existing ConvNet scaling methods arbitrarily scale network dimensions, leading to suboptimal accuracy and efficiency trade-offs.", + "method_overview": "The paper proposes compound scaling that uniformly scales network width, depth, and resolution with a fixed ratio, guided by neural architecture search to find optimal scaling coefficients.", + "main_results": "EfficientNet-B7 achieves 84.3% top-1 accuracy on ImageNet while being 8.4x smaller and 6.1x faster than the best existing ConvNet.", + "claimed_contributions": [ + "Novel compound scaling method for ConvNets", + "EfficientNet family with state-of-the-art accuracy/efficiency", + "Systematic study of scaling dimensions" + ], + "assumptions": [ + "ImageNet classification transfers to 
other vision tasks", + "Compound scaling works across different architectures" + ], + "limitations": [ + "Limited evaluation on tasks beyond image classification", + "Scaling coefficients may not generalize to all architectures" + ], + "problem_tags": ["classification", "computer-vision", "efficiency"], + "technique_tags": ["cnn", "neural-architecture-search", "model-scaling"], + "entities": ["ImageNet", "MobileNet", "ResNet", "NASNet"], + "relevance_to_user": null, + "recommended_sections": ["Section 3.1", "Table 2", "Figure 2"] +} +``` + +This schema provides a foundation for consistent, structured paper analysis while remaining flexible enough to evolve with new research needs and AI capabilities. \ No newline at end of file diff --git a/tests/test_cli.py b/tests/test_cli.py index d2641b3..4d7c530 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -3,7 +3,6 @@ import shutil import subprocess from pathlib import Path -from unittest.mock import patch import pytest diff --git a/tests/test_importer.py b/tests/test_importer.py index 66540df..972c7cf 100644 --- a/tests/test_importer.py +++ b/tests/test_importer.py @@ -106,7 +106,7 @@ class TestLocalImporter: def test_import_duplicate_pdf(self, local_importer, sample_pdf): """Test importing the same PDF twice.""" # Import once - metadata1 = local_importer.import_pdf(pdf_path=sample_pdf) + local_importer.import_pdf(pdf_path=sample_pdf) # Try to import again with pytest.raises(ValueError, match="Paper already imported"): diff --git a/tests/test_integration.py b/tests/test_integration.py index d12bec4..8c436c6 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -6,10 +6,9 @@ from pathlib import Path import pytest from paperlib.config import LibraryPaths -from paperlib.converter import MinerUConverter -from paperlib.importer import ArxivImporter, LocalImporter +from paperlib.importer import LocalImporter from paperlib.index import DatabaseManager -from paperlib.models import ConversionStatus, 
SourceType +from paperlib.models import SourceType from paperlib.storage import PaperStorageManager diff --git a/tests/test_models.py b/tests/test_models.py index 18744e2..e996a15 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -5,8 +5,6 @@ import tempfile from datetime import datetime from pathlib import Path -import pytest - from paperlib.models import ( ConversionStatus, PaperMetadata, diff --git a/tests/test_storage.py b/tests/test_storage.py index 2ca1664..647534e 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -1,13 +1,12 @@ """Tests for paperlib storage manager.""" import shutil -import tempfile from pathlib import Path import pytest from paperlib.config import LibraryPaths -from paperlib.models import ConversionStatus, PaperMetadata, SourceType +from paperlib.models import ConversionStatus, SourceType from paperlib.storage import PaperStorageManager