diff --git a/README.md b/README.md index c15da96..0ec7954 100644 --- a/README.md +++ b/README.md @@ -31,12 +31,12 @@ Gemini is preferred because: * it has a very large context window - this allows to manage the input when many papers are retrieved much more comfortably (note: it is known that ) Claude is experimental because of rate limits and input context limitations. You may see errors for limits exceeded/too many requests etc. There are mechanisms whereby askademic retries, but let us know if something really is off, we're working on improving this. -Support for other LLM families ~will~ might be coming soon. - ## Important note about the paper-reading feature The agent reads the paper via pulling the whole text from arXiv, there isn't an API endpoint giving whole text so this is done via the site. This feature is meant to be used lightly, that is, not reading too many papers at short turnarounds, because this would violates arXiv's terms of use. +To reduce the load on arXiv, Askademic caches the text of each fetched paper on disk (under `~/.askademic/cache`) and reuses it for 7 days. This cuts the number of requests sent to arXiv and speeds up responses for papers you have already accessed. + # Requirements Works with Python 3.11 and above. 
diff --git a/src/askademic/article.py b/src/askademic/article.py index 66e4d4a..c115247 100644 --- a/src/askademic/article.py +++ b/src/askademic/article.py @@ -61,9 +61,10 @@ class ArticleRetrievalResponse(BaseModel): class ArticleAgent: - def __init__(self, model: Model, model_settings: ModelSettings = None): + def __init__(self, model: Model, model_settings: ModelSettings = None, use_cache: bool = True): self._get_article = get_article + self.use_cache = use_cache self._search_articles_by_title = search_articles_by_title self._article_request_discriminator_agent = Agent( @@ -128,7 +129,7 @@ async def _answer_question( Returns: ArticleResponse: the response with the article content """ - article = self._get_article(article_link) + article = self._get_article(article_link, use_cache=self.use_cache) return await self._article_agent.run( USER_PROMPT_ARTICLE_TEMPLATE.format(request=request, article=article) ) diff --git a/src/askademic/orchestrator.py b/src/askademic/orchestrator.py index 9f52161..faf64b7 100644 --- a/src/askademic/orchestrator.py +++ b/src/askademic/orchestrator.py @@ -96,6 +96,7 @@ async def answer_article(ctx: RunContext[Context], question: str) -> list[str]: article_agent = ArticleAgent( orchestrator_agent_base.model, orchestrator_agent_base.model_settings, + use_cache=True # Enable caching by default ) r = await article_agent.run(request=question) return r diff --git a/src/askademic/tools.py b/src/askademic/tools.py index 3076a90..ad348fc 100644 --- a/src/askademic/tools.py +++ b/src/askademic/tools.py @@ -1,9 +1,12 @@ +import hashlib import json import logging +import os import random import time -from datetime import datetime +from datetime import datetime, timedelta from io import BytesIO +from pathlib import Path import feedparser import pymupdf @@ -224,14 +227,79 @@ def retrieve_recent_articles( return list(df_articles["abstract"][:].values) -def get_article(url: str, max_attempts: int = 10) -> str: +def get_cache_path() -> Path: + 
"""Create and return the cache directory path""" + cache_dir = Path(os.path.expanduser("~/.askademic/cache")) + cache_dir.mkdir(parents=True, exist_ok=True) + return cache_dir + + +def get_cache_key(url: str) -> str: + """Return a stable hex digest of the URL, used only as the cache file name""" + return hashlib.md5(url.encode(), usedforsecurity=False).hexdigest() + + +def get_article_from_cache(url: str) -> tuple[bool, str]: + """Attempt to retrieve article from cache; any unusable entry counts as a miss + + Returns: + tuple: (hit, content) where hit is True if cache hit, False (with "" content) otherwise + """ + cache_path = get_cache_path() / f"{get_cache_key(url)}.json" + + if not cache_path.exists(): + return False, "" + + try: + with open(cache_path, "r", encoding="utf-8") as f: + cache_data = json.load(f) + + # Check if cache is expired (7 days) + timestamp = datetime.fromisoformat(cache_data["timestamp"]) + if datetime.now() - timestamp > timedelta(days=7): + return False, "" + + logger.info(f"{datetime.now()}: Cache hit for {url}") + return True, cache_data["content"] + except (OSError, KeyError, TypeError, ValueError): + # Unreadable, corrupt or wrongly-shaped entry (JSONDecodeError is a ValueError): treat as a miss + return False, "" + + +def save_article_to_cache(url: str, content: str) -> None: + """Save article content to cache (best effort: write failures are logged, not raised)""" + cache_path = get_cache_path() / f"{get_cache_key(url)}.json" + + cache_data = { + "url": url, + "timestamp": datetime.now().isoformat(), + "content": content + } + + try: + with open(cache_path, "w", encoding="utf-8") as f: + json.dump(cache_data, f) + logger.info(f"{datetime.now()}: Saved to cache: {url}") + except Exception as e: + logger.error(f"{datetime.now()}: Failed to save to cache: {e}") + + +def get_article(url: str, max_attempts: int = 10, use_cache: bool = True) -> str: """ Opens an article using its URL (PDF version) and returns its text content. + With caching functionality to avoid repeated downloads. + Args: url: the article arXiv URL max_attempts: the maximum number of attempts to open the article. Default is 10. - Do not change this parameter. + use_cache: whether to use cached article if available. Default is True. 
""" + + # Try to get from cache first if enabled + if use_cache: + cache_hit, cached_content = get_article_from_cache(url) + if cache_hit: + return cached_content logger.info(f"{datetime.now()}: API URL to retrieve article: {url}") @@ -264,10 +332,14 @@ def get_article(url: str, max_attempts: int = 10) -> str: # curtail the article to 70k characters (there can be books, too long) article = article[:70000] - article = f""" + formatted_article = f""" -------{url}------------ {article} ------END---------------- """ + + # Save to cache if retrieval was successful and not "Article Not Found" + if article != "Article Not Found" and use_cache: + save_article_to_cache(url, formatted_article) - return article + return formatted_article diff --git a/tests/test_article.py b/tests/test_article.py index 3eb60a7..d724a50 100644 --- a/tests/test_article.py +++ b/tests/test_article.py @@ -136,12 +136,12 @@ async def test_article_agent( ) ) article_agent._get_article.assert_called_once_with( - retrieval_response.article_link + retrieval_response.article_link, use_cache=True ) else: article_agent._get_article.assert_called_once_with( - request_discriminator_response.article_value + request_discriminator_response.article_value, use_cache=True ) article_agent._article_agent.run.assert_called_once_with( diff --git a/tests/test_article_cache.py b/tests/test_article_cache.py new file mode 100644 index 0000000..6d5d61b --- /dev/null +++ b/tests/test_article_cache.py @@ -0,0 +1,100 @@ +import os +import shutil +import tempfile +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from askademic.tools import ( + get_article, + get_article_from_cache, + get_cache_key, + get_cache_path, + save_article_to_cache, +) + + +class TestArticleCache: + @pytest.fixture + def temp_cache_dir(self): + # Create a temporary directory + temp_dir = tempfile.mkdtemp() + + # Patch the get_cache_path function to return our temp directory + with 
patch("askademic.tools.get_cache_path", return_value=Path(temp_dir)): + yield temp_dir + + # Clean up after test + shutil.rmtree(temp_dir) + + def test_get_cache_key(self): + # Test that the same URL always generates the same key + url = "https://arxiv.org/pdf/2401.00001.pdf" + key1 = get_cache_key(url) + key2 = get_cache_key(url) + assert key1 == key2 + + # Test that different URLs generate different keys + url2 = "https://arxiv.org/pdf/2401.00002.pdf" + key3 = get_cache_key(url2) + assert key1 != key3 + + def test_save_and_retrieve_from_cache(self, temp_cache_dir): + url = "https://arxiv.org/pdf/2401.00001.pdf" + content = "Test article content" + + # Save to cache + save_article_to_cache(url, content) + + # Verify file exists in temp directory + cache_file = Path(temp_cache_dir) / f"{get_cache_key(url)}.json" + assert cache_file.exists() + + # Retrieve from cache + hit, retrieved_content = get_article_from_cache(url) + assert hit is True + assert retrieved_content == content + + def test_cache_miss(self, temp_cache_dir): + url = "https://arxiv.org/pdf/nonexistent.pdf" + + # Try to retrieve non-existent article + hit, content = get_article_from_cache(url) + assert hit is False + assert content == "" + + def test_article_retrieval_with_cache(self, temp_cache_dir): + url = "https://arxiv.org/pdf/2401.00001.pdf" + test_content = "Test article content" + formatted_content = f""" + -------{url}------------ + {test_content} + ------END---------------- + """ + + # Directly save content to cache first + save_article_to_cache(url, formatted_content) + + # First call should use cache + with patch("askademic.tools.requests.get") as mock_get: + result1 = get_article(url, use_cache=True) + assert not mock_get.called # Should not make a network call + assert result1 == formatted_content + + # Call with cache disabled - should try to use network + with patch("askademic.tools.requests.get") as mock_get: + mock_response = MagicMock() + mock_response.ok = True + 
mock_response.content = b"Test content" + mock_get.return_value = mock_response + + with patch("askademic.tools.pymupdf.open") as mock_open: + mock_doc = MagicMock() + mock_page = MagicMock() + mock_page.get_text.return_value = "New test content" + mock_doc.__enter__.return_value = [mock_page] + mock_open.return_value = mock_doc + + result2 = get_article(url, use_cache=False) + assert mock_get.called # Should make a network call \ No newline at end of file