Commits
26 commits
- `eb59cfd` Add technical_report.ipynb; implement backups per student figures (phrdang, May 8, 2026)
- `d454472` Add total time spent (days) (phrdang, May 8, 2026)
- `8e81964` Add more metrics and figures (phrdang, May 8, 2026)
- `e166333` Add time between backups (phrdang, May 8, 2026)
- `182775e` Add net num lines added (phrdang, May 8, 2026)
- `03b6a74` Add backups with print, top lint errors all backups, fix histplot (phrdang, May 8, 2026)
- `9b25223` Add lint errors final backups, analysis by problem (phrdang, May 8, 2026)
- `8ae8b58` Add median worksession length per student (phrdang, May 8, 2026)
- `9227ce5` Update technical_report.ipynb figures based on Lisa's feedback up to … (phrdang, May 12, 2026)
- `0dbb7f9` Update backups CLI to iteratively fetch all backups if limit is null … (phrdang, May 12, 2026)
- `6e7f3cb` Set limit: null for all configs to fetch all backups (phrdang, May 12, 2026)
- `0e1fdce` Add ruff cache to gitignore (phrdang, May 12, 2026)
- `111313f` ruff format (phrdang, May 12, 2026)
- `c6fd2ff` Update technical_report.ipynb with figures up to but not including "P… (phrdang, May 12, 2026)
- `3014586` Update technical_report.ipynb to add print debugging new figures (phrdang, May 12, 2026)
- `c03279e` Add backups per problem, only Q8 and Q12 + subfigures + IQR (phrdang, May 12, 2026)
- `6cb6a24` Add worksessions per student, worksession length per student figures (phrdang, May 12, 2026)
- `b52d966` Support serializing/deserializing results.json so that we don't have … (phrdang, May 12, 2026)
- `2f456f1` Add results.json to src/notebooks/.gitignore (phrdang, May 12, 2026)
- `d4109e9` uv add plotnine (phrdang, May 13, 2026)
- `9cf23a8` Add top k lint error slope charts (phrdang, May 13, 2026)
- `1a26b8d` ruff format (phrdang, May 13, 2026)
- `a8b9391` Update Backups CLI README (phrdang, May 13, 2026)
- `f3d9ab7` Add rich progress bar for store, lint, and backup-file-metadata commands (phrdang, May 13, 2026)
- `5dbc6ed` Update backups README (phrdang, May 13, 2026)
- `b527149` Update lint error final backup figures to have IQR, slope plot (phrdang, May 13, 2026)
1 change: 1 addition & 0 deletions .gitignore
@@ -2,3 +2,4 @@
data/private/

.DS_Store
.ruff_cache/
47 changes: 45 additions & 2 deletions src/backups/README.md
@@ -48,6 +48,38 @@ uv run python3 main.py backup-file-metadata
> activate and deactivate the virtual environment manually with
> `source .venv/bin/activate` and `deactivate`, respectively.

> [!TIP]
> If the `request` command is in progress and you have to leave (and will lose
> internet access), you can pause the process on Linux/macOS with `Ctrl + Z`
> and later resume it in the foreground with `fg`, so you don't lose your progress.
> (This has only been tested on macOS, on Eduroam wifi both before and after
> pausing; resuming on a different network might not work.)

> [!WARNING]
> OkPy tokens expire in under 12 hours. If your token expires while the
> `request` command is running, the next API request will fail with a
> 401 Unauthorized error and the script will terminate, losing all your
> progress. We therefore recommend getting a fresh OkPy token right before
> running the `request` command, especially for courses with more than 500 students.

> [!WARNING]
> Anecdotally, running the `request` command in several shells at once has
> produced 502 Bad Gateway errors in the OkPy API responses. We recommend
> running no more than 3 concurrent `request` commands to avoid this issue.

> [!WARNING]
> VS Code often activates a Python virtual environment automatically when you
> open an integrated terminal, which can interfere with the virtual environment
> set up by `uv`. If you're running commands and see a warning like this:
> "warning: `VIRTUAL_ENV=/path/to/.venv` does not match the project environment path
> `.venv` and will be ignored; use `--active` to target the active environment instead"
> you can safely ignore it as long as you're running the Backups CLI commands in the
> `src/backups` directory. Do NOT use the `--active` flag: it would target an
> entirely different virtual environment, such as `src/notebooks/.venv`, and
> cause errors.

Run `--help` with any of the commands for more information.

Create a configuration file to save yourself the effort of typing a bunch of CLI arguments.
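For reference, the config files touched in this PR share the shape sketched below (assembled from the diffs in this PR; the `course` section's fields are elided in this view, so only the `okpy_api` portion is shown):

```json
{
  "okpy_api": {
    "course_endpoint": "cal/cs88/fa25",
    "limit": null,
    "offset": 0
  }
}
```

Per the CLI change in this PR, `"limit": null` now means "iteratively fetch all backups".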
@@ -67,13 +99,24 @@
to be retrieved later by the web app. We recommend one of two methods:
For both methods, you can refer to the documentation linked above. To save yourself some reading, here is an example
of the commands you would need to run for method 2, assuming you have [already configured and authenticated through the AWS CLI](https://github.com/berkeley-cdss/assignment-snapshots/tree/main/src/snapshots-app#aws-s3-configuration-and-authentication):

1. `cd` into the folder that you want to upload, replacing `$FILE_PATH` with your desired path, e.g. `cal/cs88/fa25/ants`:
1. `cd` into the folder that you want to upload, replacing `$FILE_PATH` with your desired path:
```sh
cd data/private/$FILE_PATH

# Example
cd data/private/cal/cs88/fa25/ants
```
2. Run the following command to synchronize the contents of the folder you are currently inside to the folder in our AWS S3 bucket, replacing `$BUCKET_NAME` with your desired bucket (`ucb-assignment-snapshots-eae254943a2c4f51bef67654e99560dd`) and `$FILE_PATH` with your desired path, e.g. `cal/cs88/fa25/ants`:
1. Run the following command to synchronize the contents of the folder you are currently inside to the folder in our AWS S3 bucket, replacing `$BUCKET_NAME` with your desired bucket and `$FILE_PATH` with your desired path:
```sh
# Regular sync: Upload all files in current directory to destination path in the bucket
aws s3 sync . s3://$BUCKET_NAME/$FILE_PATH

# Sync + delete: Same as above + delete any files in destination that aren't in current directory
aws s3 sync . s3://$BUCKET_NAME/$FILE_PATH --delete

# Examples
aws s3 sync . s3://ucb-assignment-snapshots-eae254943a2c4f51bef67654e99560dd/cal/cs88/fa25/ants
aws s3 sync . s3://ucb-assignment-snapshots-eae254943a2c4f51bef67654e99560dd/cal/cs88/fa25/ants --delete
```

> [!NOTE]
2 changes: 1 addition & 1 deletion src/backups/configs/cs61a/fa23.json
@@ -1,7 +1,7 @@
{
"okpy_api": {
"course_endpoint": "cal/cs61a/fa23",
"limit": 150,
"limit": null,
"offset": 0
},
"course": {
2 changes: 1 addition & 1 deletion src/backups/configs/cs61a/fa24.json
@@ -1,7 +1,7 @@
{
"okpy_api": {
"course_endpoint": "cal/cs61a/fa24",
"limit": 150,
"limit": null,
"offset": 0
},
"course": {
2 changes: 1 addition & 1 deletion src/backups/configs/cs61a/fa25.json
@@ -1,7 +1,7 @@
{
"okpy_api": {
"course_endpoint": "cal/cs61a/fa25",
"limit": 150,
"limit": null,
"offset": 0
},
"course": {
2 changes: 1 addition & 1 deletion src/backups/configs/cs61a/sp24.json
@@ -2,7 +2,7 @@
"okpy_api": {
"course_endpoint": "ucb/cs61a/sp24",
"sub_course_endpoint": "cal/cs61a/sp24",
"limit": 150,
"limit": null,
"offset": 0
},
"course": {
2 changes: 1 addition & 1 deletion src/backups/configs/cs61a/sp25.json
@@ -1,7 +1,7 @@
{
"okpy_api": {
"course_endpoint": "cal/cs61a/sp25",
"limit": 150,
"limit": null,
"offset": 0
},
"course": {
2 changes: 1 addition & 1 deletion src/backups/configs/cs61a/su24.json
@@ -1,7 +1,7 @@
{
"okpy_api": {
"course_endpoint": "cal/cs61a/su24",
"limit": 150,
"limit": null,
"offset": 0
},
"course": {
2 changes: 1 addition & 1 deletion src/backups/configs/cs61a/su25.json
@@ -1,7 +1,7 @@
{
"okpy_api": {
"course_endpoint": "cal/cs61a/su25",
"limit": 150,
"limit": null,
"offset": 0
},
"course": {
2 changes: 1 addition & 1 deletion src/backups/configs/datac88c/fa23.json
@@ -1,7 +1,7 @@
{
"okpy_api": {
"course_endpoint": "cal/cs88/fa23",
"limit": 150,
"limit": null,
"offset": 0
},
"course": {
2 changes: 1 addition & 1 deletion src/backups/configs/datac88c/fa24.json
@@ -2,7 +2,7 @@
"okpy_api": {
"course_endpoint": "cal/c88c/fa24",
"sub_course_endpoint": "cal/cs88/fa24",
"limit": 150,
"limit": null,
"offset": 0
},
"course": {
2 changes: 1 addition & 1 deletion src/backups/configs/datac88c/fa25.json
@@ -1,7 +1,7 @@
{
"okpy_api": {
"course_endpoint": "cal/cs88/fa25",
"limit": 150,
"limit": null,
"offset": 0
},
"course": {
2 changes: 1 addition & 1 deletion src/backups/configs/datac88c/sp24.json
@@ -1,7 +1,7 @@
{
"okpy_api": {
"course_endpoint": "cal/cs88/sp24",
"limit": 150,
"limit": null,
"offset": 0
},
"course": {
2 changes: 1 addition & 1 deletion src/backups/configs/datac88c/sp25.json
@@ -1,7 +1,7 @@
{
"okpy_api": {
"course_endpoint": "cal/cs88/sp25",
"limit": 150,
"limit": null,
"offset": 0
},
"course": {
2 changes: 1 addition & 1 deletion src/backups/configs/datac88c/su24.json
@@ -2,7 +2,7 @@
"okpy_api": {
"course_endpoint": "cal/cs88/sp24",
"sub_course_endpoint": "cal/cs88/su24",
"limit": 150,
"limit": null,
"offset": 0
},
"course": {
2 changes: 1 addition & 1 deletion src/backups/configs/datac88c/su25.json
@@ -1,7 +1,7 @@
{
"okpy_api": {
"course_endpoint": "cal/cs88/su25",
"limit": 150,
"limit": null,
"offset": 0
},
"course": {
2 changes: 1 addition & 1 deletion src/backups/main.py
@@ -144,7 +144,7 @@ def request(

if limit is None:
limit = config_dict["okpy_api"]["limit"]
assert limit >= 0, "limit should be non-negative"
assert limit is None or limit >= 0, "limit should be null or non-negative"

if offset is None:
offset = config_dict["okpy_api"]["offset"]
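The relaxed assertion above can be captured in a small standalone helper (hypothetical name `resolve_limit`; in the real CLI the fallback value comes from `config_dict["okpy_api"]["limit"]`):

```python
def resolve_limit(cli_limit, config_limit):
    """Resolve the effective backup limit, falling back to the config value.

    None means "fetch all backups"; otherwise the limit must be a
    non-negative batch size.
    """
    limit = cli_limit if cli_limit is not None else config_limit
    assert limit is None or limit >= 0, "limit should be null or non-negative"
    return limit
```

With this shape, a `null` in the JSON config (loaded as Python `None`) flows through unchanged and signals "fetch everything".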
98 changes: 77 additions & 21 deletions src/backups/request.py
@@ -4,17 +4,20 @@
"""

import requests
from typing import List, Dict
from typing import List, Dict, Union
from tqdm import tqdm

BASE_URL = "https://okpy.org/api/v3"
BACKUP_BATCH_SIZE = 150
HARD_LIMIT = 600
ERROR_401_MESSAGE = "OkPy API response had 401 status code. Update your OkPy token in the `.env` file with the result from running `python3 ok --get-token` in any OkPy assignment directory and then try again"


def get_backups(
assignment_endpoint: str,
email: str,
access_token: str,
limit: int = 150,
limit: int = BACKUP_BATCH_SIZE,
offset: int = 0,
) -> requests.Response:
"""
@@ -48,6 +51,7 @@ def get_backups(
BASE_URL + api_endpoint,
params=params,
headers=headers,
timeout=30, # seconds (10 is too low)
)


@@ -76,10 +80,17 @@ def get_backups_for_all_assignments(
hw_start: int,
hw_end: int,
projects: List[str],
limit: int = 150,
limit: Union[int, None] = 150,
offset: int = 0,
) -> dict:
"""Get backups for all assignments of one particular user"""
"""
Get backups for all assignments of one particular user.
If `limit` is `None`, iteratively fetch all of their backups (up to `HARD_LIMIT` backups,
to prevent infinite looping) in batches of `BACKUP_BATCH_SIZE`, ignoring `offset`.
"""
fetch_all = limit is None
has_more = True

lab_names = get_all_lab_names(lab_start, lab_end)
hw_names = get_all_hw_names(hw_start, hw_end)
all_names = lab_names + hw_names + projects
@@ -90,27 +101,72 @@
assignment_endpoint = f"{course_endpoint}/{assignment_name}"

try:
response = get_backups(
assignment_endpoint,
email,
access_token,
limit,
offset,
)

if response.status_code == 401:
raise RuntimeError(
"OkPy API response had 401 status code. Update your OkPy token in the `.env` file with the result from running `python3 ok --get-token` in any OkPy assignment directory and then try again"
)

if not response.ok:
print(
f"Response for user {email}, assignment {assignment_name} did not have OK status code: {response.status_code}: {response.reason}. {response.text}"
if not fetch_all:
response = get_backups(
assignment_endpoint,
email,
access_token,
limit,
offset,
)

all_responses[assignment_name] = response.json()
if response.status_code == 401:
raise RuntimeError(ERROR_401_MESSAGE)

if not response.ok:
print(
f"Response for user {email}, assignment {assignment_name} did not have OK status code: {response.status_code}: {response.reason}. {response.text}"
)

all_responses[assignment_name] = response.json()
else:
merged_responses = None
has_more = True  # reset per assignment; otherwise a prior assignment ending with has_more=False would skip this one's fetch loop
limit = BACKUP_BATCH_SIZE
offset = 0

while has_more and offset < HARD_LIMIT:
response = get_backups(
assignment_endpoint,
email,
access_token,
limit,
offset,
)

if response.status_code == 401:
raise RuntimeError(ERROR_401_MESSAGE)

if not response.ok:
print(
f"Response for user {email}, assignment {assignment_name} did not have OK status code: {response.status_code}: {response.reason}. {response.text}"
)

response = response.json()

if merged_responses is None:
merged_responses = response
else:
merged_responses["data"]["backups"].extend(
response["data"]["backups"]
)
merged_responses["data"]["count"] = response["data"]["count"]
merged_responses["data"]["limit"] = response["data"]["limit"]
merged_responses["data"]["offset"] = response["data"]["offset"]
merged_responses["data"]["has_more"] = response["data"][
"has_more"
]

merged_responses["code"] = response["code"]
merged_responses["message"] = response["message"]

has_more = response["data"]["has_more"]
offset += BACKUP_BATCH_SIZE

all_responses[assignment_name] = merged_responses
except RuntimeError as e:
raise e
except requests.exceptions.Timeout:
print(f"Request for {email} timed out, skipping")
except Exception as e:
print(
f"Exception {type(e)} {e} was raised when getting backup for {email}, skipping"
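The pagination loop added to `get_backups_for_all_assignments` follows a standard fetch-until-`has_more`-is-false pattern. A minimal standalone sketch (with a stubbed `fetch_page` callable in place of the OkPy API call, and the same `HARD_LIMIT` guard; the real code additionally copies the top-level `code` and `message` fields):

```python
BACKUP_BATCH_SIZE = 150
HARD_LIMIT = 600


def fetch_all_backups(fetch_page):
    """Fetch batches until the API reports no more results, capping the
    total offset at HARD_LIMIT to prevent infinite loops."""
    merged = None
    offset = 0
    has_more = True
    while has_more and offset < HARD_LIMIT:
        page = fetch_page(limit=BACKUP_BATCH_SIZE, offset=offset)
        if merged is None:
            merged = page
        else:
            # Accumulate backups; keep the latest pagination metadata
            merged["data"]["backups"].extend(page["data"]["backups"])
            merged["data"].update(
                {k: page["data"][k] for k in ("count", "limit", "offset", "has_more")}
            )
        has_more = page["data"]["has_more"]
        offset += BACKUP_BATCH_SIZE
    return merged
```

Injecting `fetch_page` as a parameter keeps the loop testable without hitting the network.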
7 changes: 4 additions & 3 deletions src/backups/storage.py
@@ -8,6 +8,7 @@
from typing import List, Dict, Callable
import json
from datetime import datetime
from rich.progress import track

from db import (
CREATE_BACKUP_METADATA_TABLE_CMD,
@@ -338,7 +339,7 @@ def store_backup_file_metadata(
f"Computed {len(backup_file_metadata_objects)} backup file metadata objects"
)

for bfm in backup_file_metadata_objects:
for bfm in track(backup_file_metadata_objects, description="Writing backup file metadata to SQLite database...", total=len(backup_file_metadata_objects)):
insert_backup_file_metadata_record(conn, bfm)

if verbose:
Expand All @@ -358,7 +359,7 @@ def responses_to_backups(
deidentify: bool,
) -> int:
num_backups = 0
for student_email, assignment_response_dict in emails_to_responses.items():
for student_email, assignment_response_dict in track(emails_to_responses.items(), description="Writing OkPy API output to disk and SQLite database...", total=len(emails_to_responses)):
for assignment, response in assignment_response_dict.items():
# NOTE: For older semesters, the Ants project endpoint was 'proj03' instead of 'ants',
# so here we manually correct it when storing the data for consistency
@@ -413,7 +414,7 @@ def store_lint_errors(
if verbose:
print(f"Parsed {len(lint_errors)} lint errors")

for err in lint_errors:
for err in track(lint_errors, description="Writing lint errors to SQLite database...", total=len(lint_errors)):
insert_lint_error_record(conn, err)

if verbose:
2 changes: 2 additions & 0 deletions src/notebooks/.gitignore
@@ -0,0 +1,2 @@
# Output from technical_report.ipynb
results.json
2 changes: 2 additions & 0 deletions src/notebooks/pyproject.toml
@@ -6,7 +6,9 @@ dependencies = [
"matplotlib>=3.10.8",
"numpy>=2.4.2",
"pandas>=3.0.0",
"plotnine>=0.15.4",
"seaborn>=0.13.2",
"tqdm>=4.67.3",
]

[dependency-groups]