-
Notifications
You must be signed in to change notification settings - Fork 5
Feature/get article cache #147
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
4 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,9 +1,12 @@ | ||
| import hashlib | ||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hashlib è built-in giusto, non va messa nel pyprog?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sì tutto bultin |
||
| import json | ||
| import logging | ||
| import os | ||
| import random | ||
| import time | ||
| from datetime import datetime | ||
| from datetime import datetime, timedelta | ||
| from io import BytesIO | ||
| from pathlib import Path | ||
|
|
||
| import feedparser | ||
| import pymupdf | ||
|
|
@@ -224,14 +227,79 @@ def retrieve_recent_articles( | |
| return list(df_articles["abstract"][:].values) | ||
|
|
||
|
|
||
| def get_article(url: str, max_attempts: int = 10) -> str: | ||
| def get_cache_path() -> Path: | ||
| """Create and return the cache directory path""" | ||
| cache_dir = Path(os.path.expanduser("~/.askademic/cache")) | ||
| cache_dir.mkdir(parents=True, exist_ok=True) | ||
| return cache_dir | ||
|
|
||
|
|
||
| def get_cache_key(url: str) -> str: | ||
| """Generate a unique cache key from the URL""" | ||
| return hashlib.md5(url.encode()).hexdigest() | ||
|
|
||
|
|
||
| def get_article_from_cache(url: str) -> tuple[bool, str]: | ||
| """Attempt to retrieve article from cache | ||
|
|
||
| Returns: | ||
| tuple: (hit, content) where hit is True if cache hit, False otherwise | ||
| """ | ||
| cache_path = get_cache_path() / f"{get_cache_key(url)}.json" | ||
|
|
||
| if not cache_path.exists(): | ||
| return False, "" | ||
|
|
||
| try: | ||
| with open(cache_path, "r") as f: | ||
| cache_data = json.load(f) | ||
|
|
||
| # Check if cache is expired (7 days) | ||
| timestamp = datetime.fromisoformat(cache_data["timestamp"]) | ||
| if datetime.now() - timestamp > timedelta(days=7): | ||
| return False, "" | ||
|
|
||
| logger.info(f"{datetime.now()}: Cache hit for {url}") | ||
| return True, cache_data["content"] | ||
| except (json.JSONDecodeError, KeyError, ValueError): | ||
| # Invalid cache file | ||
| return False, "" | ||
|
|
||
|
|
||
| def save_article_to_cache(url: str, content: str) -> None: | ||
| """Save article content to cache""" | ||
| cache_path = get_cache_path() / f"{get_cache_key(url)}.json" | ||
|
|
||
| cache_data = { | ||
| "url": url, | ||
| "timestamp": datetime.now().isoformat(), | ||
| "content": content | ||
| } | ||
|
|
||
| try: | ||
| with open(cache_path, "w") as f: | ||
| json.dump(cache_data, f) | ||
| logger.info(f"{datetime.now()}: Saved to cache: {url}") | ||
| except Exception as e: | ||
| logger.error(f"{datetime.now()}: Failed to save to cache: {e}") | ||
|
|
||
|
|
||
| def get_article(url: str, max_attempts: int = 10, use_cache: bool = True) -> str: | ||
| """ | ||
| Opens an article using its URL (PDF version) and returns its text content. | ||
| With caching functionality to avoid repeated downloads. | ||
|
|
||
| Args: | ||
| url: the article arXiv URL | ||
| max_attempts: the maximum number of attempts to open the article. Default is 10. | ||
| Do not change this parameter. | ||
| use_cache: whether to use cached article if available. Default is True. | ||
| """ | ||
|
|
||
| # Try to get from cache first if enabled | ||
| if use_cache: | ||
| cache_hit, cached_content = get_article_from_cache(url) | ||
| if cache_hit: | ||
| return cached_content | ||
|
|
||
| logger.info(f"{datetime.now()}: API URL to retrieve article: {url}") | ||
|
|
||
|
|
@@ -264,10 +332,14 @@ def get_article(url: str, max_attempts: int = 10) -> str: | |
| # curtail the article to 70k characters (there can be books, too long) | ||
| article = article[:70000] | ||
|
|
||
| article = f""" | ||
| formatted_article = f""" | ||
| -------{url}------------ | ||
| {article} | ||
| ------END---------------- | ||
| """ | ||
|
|
||
| # Save to cache if retrieval was successful and not "Article Not Found" | ||
| if article != "Article Not Found" and use_cache: | ||
| save_article_to_cache(url, formatted_article) | ||
|
|
||
| return article | ||
| return formatted_article | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,100 @@ | ||
| import os | ||
| import shutil | ||
| import tempfile | ||
| from pathlib import Path | ||
| from unittest.mock import MagicMock, patch | ||
|
|
||
| import pytest | ||
|
|
||
| from askademic.tools import ( | ||
| get_article, | ||
| get_article_from_cache, | ||
| get_cache_key, | ||
| get_cache_path, | ||
| save_article_to_cache, | ||
| ) | ||
|
|
||
|
|
||
| class TestArticleCache: | ||
| @pytest.fixture | ||
| def temp_cache_dir(self): | ||
| # Create a temporary directory | ||
| temp_dir = tempfile.mkdtemp() | ||
|
|
||
| # Patch the get_cache_path function to return our temp directory | ||
| with patch("askademic.tools.get_cache_path", return_value=Path(temp_dir)): | ||
| yield temp_dir | ||
|
|
||
| # Clean up after test | ||
| shutil.rmtree(temp_dir) | ||
|
|
||
| def test_get_cache_key(self): | ||
| # Test that the same URL always generates the same key | ||
| url = "https://arxiv.org/pdf/2401.00001.pdf" | ||
| key1 = get_cache_key(url) | ||
| key2 = get_cache_key(url) | ||
| assert key1 == key2 | ||
|
|
||
| # Test that different URLs generate different keys | ||
| url2 = "https://arxiv.org/pdf/2401.00002.pdf" | ||
| key3 = get_cache_key(url2) | ||
| assert key1 != key3 | ||
|
|
||
| def test_save_and_retrieve_from_cache(self, temp_cache_dir): | ||
| url = "https://arxiv.org/pdf/2401.00001.pdf" | ||
| content = "Test article content" | ||
|
|
||
| # Save to cache | ||
| save_article_to_cache(url, content) | ||
|
|
||
| # Verify file exists in temp directory | ||
| cache_file = Path(temp_cache_dir) / f"{get_cache_key(url)}.json" | ||
| assert cache_file.exists() | ||
|
|
||
| # Retrieve from cache | ||
| hit, retrieved_content = get_article_from_cache(url) | ||
| assert hit is True | ||
| assert retrieved_content == content | ||
|
|
||
| def test_cache_miss(self, temp_cache_dir): | ||
| url = "https://arxiv.org/pdf/nonexistent.pdf" | ||
|
|
||
| # Try to retrieve non-existent article | ||
| hit, content = get_article_from_cache(url) | ||
| assert hit is False | ||
| assert content == "" | ||
|
|
||
| def test_article_retrieval_with_cache(self, temp_cache_dir): | ||
| url = "https://arxiv.org/pdf/2401.00001.pdf" | ||
| test_content = "Test article content" | ||
| formatted_content = f""" | ||
| -------{url}------------ | ||
| {test_content} | ||
| ------END---------------- | ||
| """ | ||
|
|
||
| # Directly save content to cache first | ||
| save_article_to_cache(url, formatted_content) | ||
|
|
||
| # First call should use cache | ||
| with patch("askademic.tools.requests.get") as mock_get: | ||
| result1 = get_article(url, use_cache=True) | ||
| assert not mock_get.called # Should not make a network call | ||
| assert result1 == formatted_content | ||
|
|
||
| # Call with cache disabled - should try to use network | ||
| with patch("askademic.tools.requests.get") as mock_get: | ||
| mock_response = MagicMock() | ||
| mock_response.ok = True | ||
| mock_response.content = b"Test content" | ||
| mock_get.return_value = mock_response | ||
|
|
||
| with patch("askademic.tools.pymupdf.open") as mock_open: | ||
| mock_doc = MagicMock() | ||
| mock_page = MagicMock() | ||
| mock_page.get_text.return_value = "New test content" | ||
| mock_doc.__enter__.return_value = [mock_page] | ||
| mock_open.return_value = mock_doc | ||
|
|
||
| result2 = get_article(url, use_cache=False) | ||
| assert mock_get.called # Should make a network call |
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Perfetto! Berna se non ti spiace cancelli un rimasuglio di prima qui, vedi sopra dove dice "will might come later", cancella magari tutta la frase tanto li abbiamo messi altri llm
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fatto