Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion willisapi_client/__version__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Version details for willisapi_client"""

# Package metadata constants (name, version, project URL, description).
__client__ = "willisapi_client"
__latestVersion__ = "1.9.7"
# Repository name corrected from the transposed "willsiapi_client".
__url__ = "https://github.com/bklynhlth/willisapi_client"
__short_description__ = "A Python client for willisapi"
__content_type__ = "text/markdown"
83 changes: 83 additions & 0 deletions willisapi_client/services/metadata/archive.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import os
import requests

from willisapi_client.willisapi_client import WillisapiClient
from willisapi_client.logging_setup import logger as logger


def _archive_headers(api_key):
    """Return the JSON request headers carrying the caller's API token.

    The token is sent as ``Authorization: token <api_key>``; both the request
    body and the expected response are declared as JSON.
    """
    json_mime = "application/json"
    headers = {"Content-Type": json_mime, "Accept": json_mime}
    headers["Authorization"] = f"token {api_key}"
    return headers


def archive_metadata_csv(api_key, csv_path, total_rows, upload_type, env=None):
    """Archive the source metadata CSV alongside the data upload.

    Creates a server-side tracking row, then PUTs the raw CSV to the returned
    presigned S3 URL. Returns the tracking ``record_id`` on success (so the
    caller can finalize it with row counts), or ``None`` on failure.

    All failures are logged and swallowed — archiving must never abort the
    actual data upload. Both network calls carry a timeout so a stalled
    connection cannot block the upload that follows. Once a tracking row
    exists, every failure path reports it to the server as ``"failed"``.

    Args:
        api_key: API token sent in the ``Authorization`` header.
        csv_path: Path of the CSV file to archive; only its basename is sent
            as the ``filename``.
        total_rows: Row count recorded on the tracking row.
        upload_type: Upload category recorded on the tracking row.
        env: Optional environment selector forwarded to ``WillisapiClient``.

    Returns:
        The tracking ``record_id`` on success, otherwise ``None``.
    """
    # Stays None until the server has created a tracking row, so the failure
    # paths below know whether there is anything to finalize.
    record_id = None
    try:
        wc = WillisapiClient(env=env)
        payload = {
            "filename": os.path.basename(csv_path),
            "total_rows": total_rows,
            "upload_type": upload_type,
        }
        res = requests.post(
            wc.get_csv_archive_url(),
            headers=_archive_headers(api_key),
            json=payload,
            timeout=30,
        )
        if res.status_code != 201:
            logger.warning(f"CSV archive init failed: {res.status_code} {res.text}")
            return None

        body = res.json()
        record_id = body.get("record_id")
        presigned = body.get("presigned_url")
        if not presigned:
            logger.warning("CSV archive: no presigned URL returned")
            # finalize_metadata_csv is a no-op when record_id is missing.
            finalize_metadata_csv(api_key, record_id, "failed", 0, 0, env=env)
            return None

        # Passing the open file object lets requests stream the body.
        with open(csv_path, "rb") as f:
            put_res = requests.put(
                presigned,
                data=f,
                headers={"Content-Type": "text/csv"},
                timeout=60,
            )
        if put_res.status_code not in (200, 204):
            logger.warning(f"CSV archive S3 upload failed: {put_res.status_code}")
            finalize_metadata_csv(api_key, record_id, "failed", 0, 0, env=env)
            return None

        return record_id
    except Exception as ex:
        # Deliberate catch-all: archiving is best-effort and must not
        # propagate errors into the data upload.
        logger.warning(f"CSV archive error: {ex}")
        # If the tracking row was already created (e.g. the S3 PUT timed out
        # or the CSV could not be opened), mark it failed rather than leaving
        # it open. finalize_metadata_csv swallows its own errors.
        finalize_metadata_csv(api_key, record_id, "failed", 0, 0, env=env)
        return None


def finalize_metadata_csv(
    api_key, record_id, status, successful_rows, failed_rows, env=None
):
    """Report the CSV archive outcome and parsed row counts to the server.

    Sends a PATCH for the tracking row identified by ``record_id``. Does
    nothing when ``record_id`` is falsy (no tracking row was created). Any
    non-200 response or raised exception is logged as a warning and
    swallowed; the function always returns ``None``.

    Args:
        api_key: API token sent in the ``Authorization`` header.
        record_id: Identifier of the tracking row to update.
        status: Outcome string stored on the tracking row.
        successful_rows: Number of rows uploaded successfully.
        failed_rows: Number of rows that failed to upload.
        env: Optional environment selector forwarded to ``WillisapiClient``.
    """
    if not record_id:
        return
    try:
        wc = WillisapiClient(env=env)
        payload = {
            "status": status,
            "successful_rows": successful_rows,
            "failed_rows": failed_rows,
        }
        res = requests.patch(
            wc.get_csv_archive_finalize_url(record_id),
            headers=_archive_headers(api_key),
            json=payload,
            # Without a timeout a stalled connection would hang the caller
            # after the data upload has already completed.
            timeout=30,
        )
        if res.status_code != 200:
            logger.warning(f"CSV archive finalize failed: {res.status_code} {res.text}")
    except Exception as ex:
        # Best-effort bookkeeping: never let a reporting failure surface.
        logger.warning(f"CSV archive finalize error: {ex}")
198 changes: 118 additions & 80 deletions willisapi_client/services/metadata/language_choices.py
Original file line number Diff line number Diff line change
@@ -1,85 +1,123 @@
# https://docs.aws.amazon.com/transcribe/latest/dg/supported-languages.html
Afrikaans = "af-ZA"
Arabic_gulf = "ar-AE"
Arabic_modern_standard = "ar-SA"
Chinese_simplified = "zh-CN"
Chinese_traditional = "zh-TW"
Danish = "da-DK"
Dutch = "nl-NL"
English_aus = "en-AU"
English_birtish = "en-GB"
English_ind = "en-IN"
English_irish = "en-IE"
English_new_zealand = "en-NZ"
English_scottish = "en-AB"
English_south_african = "en-ZA"
English_us = "en-US"
English_welsh = "en-WL"
French = "fr-FR"
French_canadian = "fr-CA"
Farsi = "fa-IR"
German = "de-DE"
German_swiss = "de-CH"
Hebrew = "he-IL"
Hindi_ind = "hi-IN"
Indonesian = "id-ID"
Italian = "it-IT"
Japanese = "ja-JP"
Korean = "ko-KR"
Malay = "ms-MY"
Portuguese = "pt-PT"
Portuguese_brazilian = "pt-BR"
Russian = "ru-RU"
Spanish = "es-ES"
Spanish_us = "es-US"
Swedish = "sv-SE"
Tamil = "ta-IN"
Telugu = "te-IN"
Thai = "th-TH"
Turkish = "tr-TR"
Vietnamese = "vi-VN"

LANGUAGE_CHOICES = [
Afrikaans,
Arabic_gulf,
Arabic_modern_standard,
Chinese_simplified,
Chinese_traditional,
Danish,
Dutch,
English_aus,
English_birtish,
English_ind,
English_irish,
English_new_zealand,
English_scottish,
English_south_african,
English_us,
English_welsh,
French,
French_canadian,
Farsi,
German,
German_swiss,
Hebrew,
Hindi_ind,
Indonesian,
Italian,
Japanese,
Korean,
Malay,
Portuguese,
Portuguese_brazilian,
Russian,
Spanish,
Spanish_us,
Swedish,
Tamil,
Telugu,
Thai,
Turkish,
Vietnamese,
# Ordered list of (language_code, display_name) pairs.
# "en-US" and "es-US" lead the list; the remaining entries are grouped roughly
# alphabetically by display name — presumably so the most common choices
# surface first (TODO confirm).
SUPPORTED_LANGUAGES = [
("en-US", "English (US)"),
("es-US", "Spanish (US)"),
("ab-GE", "Abkhaz"),
("af-ZA", "Afrikaans"),
("sq-AL", "Albanian"),
("am-ET", "Amharic"),
("ar-AE", "Arabic, Gulf"),
("ar-SA", "Arabic, Modern Standard"),
("hy-AM", "Armenian"),
("ast-ES", "Asturian"),
("az-AZ", "Azerbaijani"),
("ba-RU", "Bashkir"),
("eu-ES", "Basque"),
("be-BY", "Belarusian"),
("bn-IN", "Bengali"),
("bs-BA", "Bosnian"),
("my-MM", "Burmese"),
("bg-BG", "Bulgarian"),
("ca-ES", "Catalan"),
("ckb-IR", "Central Kurdish (Iran)"),
("ckb-IQ", "Central Kurdish (Iraq)"),
("zh-HK", "Chinese, Cantonese"),
("zh-CN", "Chinese, Simplified"),
("zh-TW", "Chinese, Traditional"),
("hr-HR", "Croatian"),
("cs-CZ", "Czech"),
("da-DK", "Danish"),
("nl-NL", "Dutch"),
("en-AU", "English, Australian"),
("en-GB", "English, British"),
("en-IN", "English, Indian"),
("en-IE", "English, Irish"),
("en-NZ", "English, New Zealand"),
("en-AB", "English, Scottish"),
("en-ZA", "English, South African"),
("en-WL", "English, Welsh"),
# NOTE(review): Estonian is listed under two codes (et-EE and et-ET);
# confirm both are intended.
("et-EE", "Estonian"),
("et-ET", "Estonian (et-ET)"),
("fa-IR", "Farsi"),
("fa-AF", "Farsi, Afghan"),
("fi-FI", "Finnish"),
("fr-FR", "French"),
("fr-CA", "French, Canadian"),
("gl-ES", "Galician"),
("ka-GE", "Georgian"),
("de-DE", "German"),
("de-CH", "German, Swiss"),
("el-GR", "Greek"),
("gu-IN", "Gujarati"),
("ht-HT", "Haitian Creole"),
("ha-NG", "Hausa"),
("he-IL", "Hebrew"),
("hi-IN", "Hindi, Indian"),
("hu-HU", "Hungarian"),
("is-IS", "Icelandic"),
("id-ID", "Indonesian"),
("it-IT", "Italian"),
("ja-JP", "Japanese"),
("jv-ID", "Javanese"),
("kab-DZ", "Kabyle"),
("kn-IN", "Kannada"),
("kk-KZ", "Kazakh"),
("km-KH", "Khmer"),
("rw-RW", "Kinyarwanda"),
("ko-KR", "Korean"),
("ky-KG", "Kyrgyz"),
("lv-LV", "Latvian"),
("lt-LT", "Lithuanian"),
("lg-IN", "Luganda"),
("mk-MK", "Macedonian"),
("ms-MY", "Malay"),
("ml-IN", "Malayalam"),
("mt-MT", "Maltese"),
("mr-IN", "Marathi"),
("mhr-RU", "Meadow Mari"),
("mn-MN", "Mongolian"),
("ne-NP", "Nepali"),
("no-NO", "Norwegian Bokmål"),
("or-IN", "Odia/Oriya"),
("ps-AF", "Pashto"),
("pl-PL", "Polish"),
("pt-PT", "Portuguese"),
("pt-BR", "Portuguese, Brazilian"),
("pa-IN", "Punjabi"),
("ro-RO", "Romanian"),
("ru-RU", "Russian"),
("sr-RS", "Serbian"),
("si-LK", "Sinhala"),
("sk-SK", "Slovak"),
("sl-SI", "Slovenian"),
("so-SO", "Somali"),
("es-ES", "Spanish"),
("es-MX", "Spanish, Mexican"),
("su-ID", "Sundanese"),
("sw-KE", "Swahili, Kenya"),
("sw-BI", "Swahili, Burundi"),
("sw-RW", "Swahili, Rwanda"),
("sw-TZ", "Swahili, Tanzania"),
("sw-UG", "Swahili, Uganda"),
("sv-SE", "Swedish"),
("tl-PH", "Tagalog/Filipino"),
("ta-IN", "Tamil"),
("tt-RU", "Tatar"),
("te-IN", "Telugu"),
("th-TH", "Thai"),
("tr-TR", "Turkish"),
("uk-UA", "Ukrainian"),
("ug-CN", "Uyghur"),
("uz-UZ", "Uzbek"),
("vi-VN", "Vietnamese"),
("cy-WL", "Welsh"),
("wo-SN", "Wolof"),
("zu-ZA", "Zulu"),
]

# Language codes only, in the same order as SUPPORTED_LANGUAGES; display names
# are dropped.
LANGUAGE_CHOICES = [code for code, _ in SUPPORTED_LANGUAGES]

SEX_CHOICES = [
("Male", "M"),
("Female", "F"),
Expand Down
42 changes: 42 additions & 0 deletions willisapi_client/services/metadata/upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@
find_files_with_pattern,
get_last_n_directories,
)
from willisapi_client.services.metadata.archive import (
archive_metadata_csv,
finalize_metadata_csv,
)

VALID_SCORE_TYPES = ["rater", "reviewer"]

Expand All @@ -38,6 +42,15 @@ def upload(api_key: str, csv_path: str, **kwargs):
headers["Authorization"] = f"token {api_key}"
logger.info(f'{datetime.now().strftime("%H:%M:%S")}: beginning upload')

# Archive the source metadata CSV and open a tracking record.
archive_record_id = archive_metadata_csv(
api_key,
csv_path,
int(csv.transformed_df.shape[0]),
upload_type="data",
env=kwargs.get("env"),
)

results = []
for index, row in tqdm(
csv.transformed_df.iterrows(), total=csv.transformed_df.shape[0]
Expand Down Expand Up @@ -96,6 +109,16 @@ def upload(api_key: str, csv_path: str, **kwargs):
result_row["error"] = f"{err}"
results.append(result_row)

successful_rows = sum(1 for r in results if r.get("upload_status") == "Success")
finalize_metadata_csv(
api_key,
archive_record_id,
"successful",
successful_rows,
len(results) - successful_rows,
env=kwargs.get("env"),
)
Comment thread
vjbytes102 marked this conversation as resolved.

results_df = pd.DataFrame(results)
return results_df
else:
Expand Down Expand Up @@ -128,6 +151,15 @@ def processed_upload(api_key: str, csv_path: str, output_path: str, **kwargs):
headers["Authorization"] = f"token {api_key}"
logger.info(f'{datetime.now().strftime("%H:%M:%S")}: beginning upload')

# Archive the source processed-data metadata CSV and open a tracking record.
archive_record_id = archive_metadata_csv(
api_key,
csv_path,
int(csv.transformed_df.shape[0]),
upload_type="processed_data",
env=kwargs.get("env"),
)

results = []
for index, row in tqdm(
csv.transformed_df.iterrows(), total=csv.transformed_df.shape[0]
Expand Down Expand Up @@ -205,6 +237,16 @@ def processed_upload(api_key: str, csv_path: str, output_path: str, **kwargs):
result_row["error"] = f"{err}"
results.append(result_row)

successful_rows = sum(1 for r in results if r.get("upload_status") == "Success")
finalize_metadata_csv(
api_key,
archive_record_id,
"successful",
successful_rows,
len(results) - successful_rows,
env=kwargs.get("env"),
)

results_df = pd.DataFrame(results)
return results_df
else:
Expand Down
10 changes: 6 additions & 4 deletions willisapi_client/services/metadata/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,13 +421,15 @@ def __init__(self, row):
def validate_row(self):
    """Validate a data-upload row.

    The row's ``file_path`` must exist on disk. ``language`` is optional: a
    missing attribute, ``None``, an empty string or a float NaN all pass.
    Any other value must be one of ``LANGUAGE_CHOICES``.

    Returns:
        ``(True, None)`` when the row is valid, otherwise
        ``(False, <error message>)``.
    """
    if not os.path.exists(self.row.file_path):
        return (False, "File path does not exist")
    language = getattr(self.row, "language", None)
    # A float NaN is truthy and would be reported as "Invalid language: nan".
    # pandas yields NaN for empty CSV cells — presumably how blank languages
    # arrive here; verify against the CSV loader.
    if isinstance(language, float) and language != language:
        language = None
    if language and language not in LANGUAGE_CHOICES:
        return (False, f"Invalid language: {language}")
    return (True, None)

def validate_processed_data_row(self):
    """Validate a processed-data upload row.

    ``language`` is optional: a missing attribute, ``None``, an empty string
    or a float NaN all pass. Any other value must be one of
    ``LANGUAGE_CHOICES``.

    Returns:
        ``(True, None)`` when the row is valid, otherwise
        ``(False, <error message>)``.
    """
    language = getattr(self.row, "language", None)
    # A float NaN is truthy and would be reported as "Invalid language: nan".
    # pandas yields NaN for empty CSV cells — presumably how blank languages
    # arrive here; verify against the CSV loader.
    if isinstance(language, float) and language != language:
        language = None
    if language and language not in LANGUAGE_CHOICES:
        return (False, f"Invalid language: {language}")
    return (True, None)

def calculate_file_checksum(self, file_path: str) -> str:
Expand Down
Loading
Loading