martinapugliese · bernomone · Jul 4, 2025 · Jul 1, 2025 · Jul 1, 2025 · Jul 2, 2025
diff --git a/README.md b/README.md
@@ -31,12 +31,12 @@ Gemini is preferred because:
 * it has a very large context window - this allows to manage the input when many papers are retrieved much more comfortably (note: it is known that )
 Claude is experimental because of rate limits and input context limitations. You may see errors for limits exceeded/too many requests etc. There are mechanisms whereby askademic retries, but let us know if something really is off, we're working on improving this.
 
-Support for other LLM families ~will~ might be coming soon.
-
 ## Important note about the paper-reading feature
 
 The agent reads the paper via pulling the whole text from arXiv, there isn't an API endpoint giving whole text so this is done via the site. This feature is meant to be used lightly, that is, not reading too many papers at short turnarounds, because this would violates arXiv's terms of use.
 
+To minimize the load on arXiv, askademic caches the text of each fetched paper locally (under `~/.askademic/cache`) for 7 days. Re-reading a paper within that window makes no new request to arXiv and returns faster.
+
 # Requirements
 
 Works with Python 3.11 and above.

diff --git a/src/askademic/article.py b/src/askademic/article.py
@@ -61,9 +61,10 @@ class ArticleRetrievalResponse(BaseModel):
 
 
 class ArticleAgent:
-    def __init__(self, model: Model, model_settings: ModelSettings = None):
+    def __init__(self, model: Model, model_settings: ModelSettings = None, use_cache: bool = True):
 
         self._get_article = get_article
+        self.use_cache = use_cache
         self._search_articles_by_title = search_articles_by_title
 
         self._article_request_discriminator_agent = Agent(
@@ -128,7 +129,7 @@ async def _answer_question(
         Returns:
             ArticleResponse: the response with the article content
         """
-        article = self._get_article(article_link)
+        article = self._get_article(article_link, use_cache=self.use_cache)
         return await self._article_agent.run(
             USER_PROMPT_ARTICLE_TEMPLATE.format(request=request, article=article)
         )

diff --git a/src/askademic/orchestrator.py b/src/askademic/orchestrator.py
@@ -96,6 +96,7 @@ async def answer_article(ctx: RunContext[Context], question: str) -> list[str]:
     article_agent = ArticleAgent(
         orchestrator_agent_base.model,
         orchestrator_agent_base.model_settings,
+        use_cache=True  # Enable caching by default
     )
     r = await article_agent.run(request=question)
     return r

diff --git a/src/askademic/tools.py b/src/askademic/tools.py
@@ -1,9 +1,12 @@
+import hashlib
 import json
 import logging
+import os
 import random
 import time
-from datetime import datetime
+from datetime import datetime, timedelta
 from io import BytesIO
+from pathlib import Path
 
 import feedparser
 import pymupdf
@@ -224,14 +227,79 @@ def retrieve_recent_articles(
     return list(df_articles["abstract"][:].values)
 
 
-def get_article(url: str, max_attempts: int = 10) -> str:
+def get_cache_path() -> Path:
+    """Return the article cache directory (~/.askademic/cache), creating it if missing"""
+    location = Path(os.path.expanduser("~/.askademic/cache"))
+    location.mkdir(parents=True, exist_ok=True)
+    return location
+
+
+def get_cache_key(url: str) -> str:
+    """Hex MD5 digest of the URL; it only names the cache file (not security)"""
+    return hashlib.md5(url.encode(), usedforsecurity=False).hexdigest()
+
+
+def get_article_from_cache(url: str) -> tuple[bool, str]:
+    """Look up a previously cached article for ``url``
+
+    A missing, unreadable, malformed or older-than-7-days entry is a miss.
+
+    Returns:
+        tuple: (hit, content); content is "" whenever hit is False
+    """
+    try:
+        cache_path = get_cache_path() / f"{get_cache_key(url)}.json"
+        with open(cache_path, "r", encoding="utf-8") as f:
+            cache_data = json.load(f)
+
+        # Entries older than 7 days are stale and must be fetched again
+        timestamp = datetime.fromisoformat(cache_data["timestamp"])
+        if datetime.now() - timestamp > timedelta(days=7):
+            return False, ""
+        content = cache_data["content"]
+    except (OSError, KeyError, TypeError, ValueError):
+        # OSError: no/unreadable file or cache dir; the rest: malformed entry
+        # (ValueError also covers json.JSONDecodeError and bad timestamps)
+        return False, ""
+
+    logger.info(f"{datetime.now()}: Cache hit for {url}")
+    return True, content
+
+
+def save_article_to_cache(url: str, content: str) -> None:
+    """Write the article to the cache (best effort: failures are only logged)"""
+    cache_data = {
+        "url": url,
+        "timestamp": datetime.now().isoformat(),  # read back to expire the entry
+        "content": content,
+    }
+
+    try:
+        # Resolved inside the try: creating the cache directory can fail too
+        cache_path = get_cache_path() / f"{get_cache_key(url)}.json"
+        with open(cache_path, "w", encoding="utf-8") as f:
+            json.dump(cache_data, f)
+        logger.info(f"{datetime.now()}: Saved to cache: {url}")
+    except Exception as e:
+        logger.error(f"{datetime.now()}: Failed to save to cache: {e}")
+
+
+def get_article(url: str, max_attempts: int = 10, use_cache: bool = True) -> str:
     """
     Opens an article using its URL (PDF version) and returns its text content.
+    With caching functionality to avoid repeated downloads.
+
     Args:
         url: the article arXiv URL
         max_attempts: the maximum number of attempts to open the article. Default is 10.
-        Do not change this parameter.
+        use_cache: whether to use cached article if available. Default is True.
     """
+
+    # Try to get from cache first if enabled
+    if use_cache:
+        cache_hit, cached_content = get_article_from_cache(url)
+        if cache_hit:
+            return cached_content
 
     logger.info(f"{datetime.now()}: API URL to retrieve article: {url}")
 
@@ -264,10 +332,14 @@ def get_article(url: str, max_attempts: int = 10) -> str:
     # curtail the article to 70k characters (there can be books, too long)
     article = article[:70000]
 
-    article = f"""
+    formatted_article = f"""
         -------{url}------------
         {article}
         ------END----------------
     """
+
+    # Save to cache if retrieval was successful and not "Article Not Found"
+    if article != "Article Not Found" and use_cache:
+        save_article_to_cache(url, formatted_article)
 
-    return article
+    return formatted_article
diff --git a/tests/test_article.py b/tests/test_article.py
@@ -136,12 +136,12 @@ async def test_article_agent(
             )
         )
         article_agent._get_article.assert_called_once_with(
-            retrieval_response.article_link
+            retrieval_response.article_link, use_cache=True
         )
 
     else:
         article_agent._get_article.assert_called_once_with(
-            request_discriminator_response.article_value
+            request_discriminator_response.article_value, use_cache=True
         )
 
     article_agent._article_agent.run.assert_called_once_with(

diff --git a/tests/test_article_cache.py b/tests/test_article_cache.py
@@ -0,0 +1,100 @@
+import os
+import shutil
+import tempfile
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from askademic.tools import (
+    get_article,
+    get_article_from_cache,
+    get_cache_key,
+    get_cache_path,
+    save_article_to_cache,
+)
+
+
+class TestArticleCache:
+    @pytest.fixture
+    def temp_cache_dir(self):
+        # Throwaway directory so the tests never touch the real ~/.askademic/cache
+        temp_dir = tempfile.mkdtemp()
+
+        # Redirect get_cache_path to the temp directory while the test runs
+        with patch("askademic.tools.get_cache_path", return_value=Path(temp_dir)):
+            yield temp_dir
+
+        # Teardown: remove the directory and anything the test cached in it
+        shutil.rmtree(temp_dir)
+
+    def test_get_cache_key(self):
+        # The key must be deterministic: same URL, same key
+        url = "https://arxiv.org/pdf/2401.00001.pdf"
+        key1 = get_cache_key(url)
+        key2 = get_cache_key(url)
+        assert key1 == key2
+
+        # Distinct URLs must map to distinct cache files
+        url2 = "https://arxiv.org/pdf/2401.00002.pdf"
+        key3 = get_cache_key(url2)
+        assert key1 != key3
+
+    def test_save_and_retrieve_from_cache(self, temp_cache_dir):
+        url = "https://arxiv.org/pdf/2401.00001.pdf"
+        content = "Test article content"
+
+        # Round trip, step 1: write the entry
+        save_article_to_cache(url, content)
+
+        # The entry is stored as <cache key>.json inside the (patched) cache dir
+        cache_file = Path(temp_cache_dir) / f"{get_cache_key(url)}.json"
+        assert cache_file.exists()
+
+        # Round trip, step 2: a fresh entry is a hit and returns the content as saved
+        hit, retrieved_content = get_article_from_cache(url)
+        assert hit is True
+        assert retrieved_content == content
+
+    def test_cache_miss(self, temp_cache_dir):
+        url = "https://arxiv.org/pdf/nonexistent.pdf"
+
+        # Nothing was saved for this URL, so the lookup reports a miss with ""
+        hit, content = get_article_from_cache(url)
+        assert hit is False
+        assert content == ""
+
+    def test_article_retrieval_with_cache(self, temp_cache_dir):
+        url = "https://arxiv.org/pdf/2401.00001.pdf"
+        test_content = "Test article content"
+        formatted_content = f"""
+        -------{url}------------
+        {test_content}
+        ------END----------------
+    """
+
+        # Pre-populate the cache so get_article has an entry to find
+        save_article_to_cache(url, formatted_content)
+
+        # Cache enabled: the entry must be returned without any HTTP request
+        with patch("askademic.tools.requests.get") as mock_get:
+            result1 = get_article(url, use_cache=True)
+            assert not mock_get.called  # Should not make a network call
+            assert result1 == formatted_content
+
+        # Cache disabled: must go to the (mocked) network despite the cached entry
+        with patch("askademic.tools.requests.get") as mock_get:
+            mock_response = MagicMock()
+            mock_response.ok = True
+            mock_response.content = b"Test content"
+            mock_get.return_value = mock_response
+
+            with patch("askademic.tools.pymupdf.open") as mock_open:
+                mock_doc = MagicMock()
+                mock_page = MagicMock()
+                mock_page.get_text.return_value = "New test content"
+                mock_doc.__enter__.return_value = [mock_page]
+                mock_open.return_value = mock_doc
+
+                result2 = get_article(url, use_cache=False)  # NOTE(review): never asserted
+                assert mock_get.called  # Should make a network call