Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,12 @@ Gemini is preferred because:
* it has a very large context window - this allows to manage the input when many papers are retrieved much more comfortably (note: it is known that )
Claude is experimental because of rate limits and input context limitations. You may see errors for limits exceeded/too many requests etc. There are mechanisms whereby askademic retries, but let us know if something really is off, we're working on improving this.

Support for other LLM families ~will~ might be coming soon.

## Important note about the paper-reading feature

The agent reads a paper by pulling its whole text from arXiv. There isn't an API endpoint that provides the full text, so this is done via the site. This feature is meant to be used lightly, that is, without reading too many papers in quick succession, because doing so would violate arXiv's terms of use.

To minimize API load, Askademic includes a caching system that saves previously fetched papers for 7 days. This helps reduce the number of requests to arXiv and improves response times for papers you've already accessed.
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perfetto! Berna se non ti spiace cancelli un rimasuglio di prima qui, vedi sopra dove dice "will might come later", cancella magari tutta la frase tanto li abbiamo messi altri llm

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fatto


# Requirements

Works with Python 3.11 and above.
Expand Down
5 changes: 3 additions & 2 deletions src/askademic/article.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,10 @@ class ArticleRetrievalResponse(BaseModel):


class ArticleAgent:
def __init__(self, model: Model, model_settings: ModelSettings = None):
def __init__(self, model: Model, model_settings: ModelSettings = None, use_cache: bool = True):

self._get_article = get_article
self.use_cache = use_cache
self._search_articles_by_title = search_articles_by_title

self._article_request_discriminator_agent = Agent(
Expand Down Expand Up @@ -128,7 +129,7 @@ async def _answer_question(
Returns:
ArticleResponse: the response with the article content
"""
article = self._get_article(article_link)
article = self._get_article(article_link, use_cache=self.use_cache)
return await self._article_agent.run(
USER_PROMPT_ARTICLE_TEMPLATE.format(request=request, article=article)
)
Expand Down
1 change: 1 addition & 0 deletions src/askademic/orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ async def answer_article(ctx: RunContext[Context], question: str) -> list[str]:
article_agent = ArticleAgent(
orchestrator_agent_base.model,
orchestrator_agent_base.model_settings,
use_cache=True # Enable caching by default
)
r = await article_agent.run(request=question)
return r
Expand Down
82 changes: 77 additions & 5 deletions src/askademic/tools.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import hashlib
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hashlib è built-in giusto, non va messa nel pyprog?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sì tutto bultin

import json
import logging
import os
import random
import time
from datetime import datetime
from datetime import datetime, timedelta
from io import BytesIO
from pathlib import Path

import feedparser
import pymupdf
Expand Down Expand Up @@ -224,14 +227,79 @@ def retrieve_recent_articles(
return list(df_articles["abstract"][:].values)


def get_article(url: str, max_attempts: int = 10) -> str:
def get_cache_path() -> Path:
    """Return the article cache directory, creating it when it is missing.

    The directory is ``~/.askademic/cache`` with ``~`` expanded for the
    current user.

    Returns:
        Path: the (now existing) cache directory.
    """
    expanded_location = os.path.expanduser("~/.askademic/cache")
    cache_location = Path(expanded_location)
    # parents=True also creates ~/.askademic; exist_ok=True makes repeated
    # calls a no-op once the directory is there.
    cache_location.mkdir(parents=True, exist_ok=True)
    return cache_location


def get_cache_key(url: str) -> str:
    """Generate a deterministic cache key from the URL.

    Args:
        url: the article URL to derive the key from.

    Returns:
        str: the hexadecimal MD5 digest of the UTF-8 encoded URL.
    """
    # MD5 only serves as a compact, filesystem-safe identifier here, not as a
    # security primitive; flagging that keeps the call usable on Python builds
    # that restrict MD5 for security purposes. The digest itself is unchanged.
    return hashlib.md5(url.encode("utf-8"), usedforsecurity=False).hexdigest()


def get_article_from_cache(url: str) -> tuple[bool, str]:
    """Attempt to retrieve an article from the on-disk cache.

    The cache entry is a JSON file named after ``get_cache_key(url)`` inside
    the directory returned by ``get_cache_path()``. Entries older than 7 days
    are treated as expired. Any problem reading or interpreting the entry is
    reported as a cache miss rather than raised.

    Args:
        url: the article URL used as the cache lookup key.

    Returns:
        tuple: (hit, content) where hit is True if cache hit, False otherwise.
        On a miss, content is the empty string.
    """
    try:
        cache_path = get_cache_path() / f"{get_cache_key(url)}.json"

        # Open directly instead of checking exists() first: a missing file
        # raises FileNotFoundError (an OSError), handled below as a miss.
        with open(cache_path, "r", encoding="utf-8") as f:
            cache_data = json.load(f)

        # Check if cache is expired (7 days)
        timestamp = datetime.fromisoformat(cache_data["timestamp"])
        if datetime.now() - timestamp > timedelta(days=7):
            return False, ""

        content = cache_data["content"]
    except (OSError, KeyError, TypeError, ValueError):
        # Missing/unreadable file (OSError), invalid JSON or timestamp text
        # (ValueError, which includes json.JSONDecodeError), missing keys
        # (KeyError), or a payload of the wrong shape/type (TypeError).
        return False, ""

    if not isinstance(content, str):
        # Callers expect text; anything else means the entry is invalid.
        return False, ""

    logger.info(f"{datetime.now()}: Cache hit for {url}")
    return True, content


def save_article_to_cache(url: str, content: str) -> None:
    """Save article content to the on-disk cache (best effort).

    The entry is written as JSON with the URL, the current local time in ISO
    format, and the content. Failures are logged and never raised, so a
    caching problem does not affect the caller.

    Args:
        url: the article URL; it determines the cache file name.
        content: the article text to store.
    """
    cache_data = {
        "url": url,
        "timestamp": datetime.now().isoformat(),
        "content": content,
    }

    try:
        # Resolve the path inside the try block: get_cache_path() creates the
        # directory and can fail (e.g. unwritable home), which must be treated
        # like any other cache-write failure.
        cache_path = get_cache_path() / f"{get_cache_key(url)}.json"
        with open(cache_path, "w", encoding="utf-8") as f:
            json.dump(cache_data, f)
        logger.info(f"{datetime.now()}: Saved to cache: {url}")
    except Exception as e:
        # Deliberately broad: caching is optional, so log and carry on.
        logger.error(f"{datetime.now()}: Failed to save to cache: {e}")


def get_article(url: str, max_attempts: int = 10, use_cache: bool = True) -> str:
"""
Opens an article using its URL (PDF version) and returns its text content.
With caching functionality to avoid repeated downloads.

Args:
url: the article arXiv URL
max_attempts: the maximum number of attempts to open the article. Default is 10.
Do not change this parameter.
use_cache: whether to use cached article if available. Default is True.
"""

# Try to get from cache first if enabled
if use_cache:
cache_hit, cached_content = get_article_from_cache(url)
if cache_hit:
return cached_content

logger.info(f"{datetime.now()}: API URL to retrieve article: {url}")

Expand Down Expand Up @@ -264,10 +332,14 @@ def get_article(url: str, max_attempts: int = 10) -> str:
# curtail the article to 70k characters (there can be books, too long)
article = article[:70000]

article = f"""
formatted_article = f"""
-------{url}------------
{article}
------END----------------
"""

# Save to cache if retrieval was successful and not "Article Not Found"
if article != "Article Not Found" and use_cache:
save_article_to_cache(url, formatted_article)

return article
return formatted_article
4 changes: 2 additions & 2 deletions tests/test_article.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,12 +136,12 @@ async def test_article_agent(
)
)
article_agent._get_article.assert_called_once_with(
retrieval_response.article_link
retrieval_response.article_link, use_cache=True
)

else:
article_agent._get_article.assert_called_once_with(
request_discriminator_response.article_value
request_discriminator_response.article_value, use_cache=True
)

article_agent._article_agent.run.assert_called_once_with(
Expand Down
100 changes: 100 additions & 0 deletions tests/test_article_cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
import os
import shutil
import tempfile
from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest

from askademic.tools import (
get_article,
get_article_from_cache,
get_cache_key,
get_cache_path,
save_article_to_cache,
)


class TestArticleCache:
    """Tests for the article cache helpers exposed by ``askademic.tools``."""

    @pytest.fixture
    def temp_cache_dir(self):
        """Point the cache at a throwaway directory for the length of a test."""
        scratch_dir = tempfile.mkdtemp()

        # Every cache helper resolves its location through get_cache_path, so
        # patching that single function redirects all reads and writes.
        with patch("askademic.tools.get_cache_path", return_value=Path(scratch_dir)):
            yield scratch_dir

        # Remove the scratch directory once the test has finished
        shutil.rmtree(scratch_dir)

    def test_get_cache_key(self):
        """Keys are stable for one URL and differ between distinct URLs."""
        first_url = "https://arxiv.org/pdf/2401.00001.pdf"
        second_url = "https://arxiv.org/pdf/2401.00002.pdf"

        first_key = get_cache_key(first_url)
        repeated_key = get_cache_key(first_url)
        second_key = get_cache_key(second_url)

        # Same URL -> same key
        assert first_key == repeated_key
        # Different URL -> different key
        assert first_key != second_key

    def test_save_and_retrieve_from_cache(self, temp_cache_dir):
        """A saved article is written to disk and read back unchanged."""
        url = "https://arxiv.org/pdf/2401.00001.pdf"
        content = "Test article content"

        save_article_to_cache(url, content)

        # The entry must land in the patched (temporary) directory
        expected_file = Path(temp_cache_dir) / f"{get_cache_key(url)}.json"
        assert expected_file.exists()

        # Reading it back reports a hit with the original content
        hit, retrieved_content = get_article_from_cache(url)
        assert hit is True
        assert retrieved_content == content

    def test_cache_miss(self, temp_cache_dir):
        """Looking up a URL that was never cached reports a miss."""
        url = "https://arxiv.org/pdf/nonexistent.pdf"

        hit, content = get_article_from_cache(url)

        assert hit is False
        assert content == ""

    def test_article_retrieval_with_cache(self, temp_cache_dir):
        """get_article serves from the cache when enabled, else hits the network."""
        url = "https://arxiv.org/pdf/2401.00001.pdf"
        test_content = "Test article content"
        formatted_content = f"""
-------{url}------------
{test_content}
------END----------------
"""

        # Seed the cache directly so the first lookup can be served from it
        save_article_to_cache(url, formatted_content)

        # Cache enabled: the stored text is returned without any HTTP request
        with patch("askademic.tools.requests.get") as mock_get:
            cached_result = get_article(url, use_cache=True)
            mock_get.assert_not_called()
            assert cached_result == formatted_content

        # Cache disabled: the same URL must go through the network path
        with patch("askademic.tools.requests.get") as mock_get, patch(
            "askademic.tools.pymupdf.open"
        ) as mock_open:
            fake_response = MagicMock()
            fake_response.ok = True
            fake_response.content = b"Test content"
            mock_get.return_value = fake_response

            fake_page = MagicMock()
            fake_page.get_text.return_value = "New test content"
            fake_doc = MagicMock()
            fake_doc.__enter__.return_value = [fake_page]
            mock_open.return_value = fake_doc

            get_article(url, use_cache=False)
            mock_get.assert_called()