diff --git a/willisapi_client/__version__.py b/willisapi_client/__version__.py
index 324bc88..51826ee 100644
--- a/willisapi_client/__version__.py
+++ b/willisapi_client/__version__.py
@@ -1,7 +1,7 @@
 """Version details for willisapi_client"""
 
 __client__ = "willisapi_client"
-__latestVersion__ = "1.9.6"
+__latestVersion__ = "1.9.7"
 __url__ = "https://github.com/bklynhlth/willsiapi_client"
 __short_description__ = "A Python client for willisapi"
 __content_type__ = "text/markdown"
diff --git a/willisapi_client/services/metadata/archive.py b/willisapi_client/services/metadata/archive.py
new file mode 100644
index 0000000..cea8395
--- /dev/null
+++ b/willisapi_client/services/metadata/archive.py
@@ -0,0 +1,106 @@
+import os
+import requests
+
+from willisapi_client.willisapi_client import WillisapiClient
+from willisapi_client.logging_setup import logger as logger
+
+# (connect, read) timeouts in seconds for every archive request; without them
+# a stalled connection would block the data upload that archiving precedes.
+_ARCHIVE_TIMEOUT = (10, 60)
+
+
+def _archive_headers(api_key):
+    """Return the JSON + token-auth headers for the csv-archive endpoints."""
+    return {
+        "Content-Type": "application/json",
+        "Accept": "application/json",
+        "Authorization": f"token {api_key}",
+    }
+
+
+def archive_metadata_csv(api_key, csv_path, total_rows, upload_type, env=None):
+    """Archive the source metadata CSV alongside the data upload.
+
+    Creates a server-side tracking row, then PUTs the raw CSV to the returned
+    presigned S3 URL. Returns the tracking ``record_id`` on success (so the
+    caller can finalize it with row counts), or ``None`` on failure.
+
+    All failures are logged and swallowed — archiving must never abort the
+    actual data upload. If a tracking row was already created when a failure
+    occurs, it is finalized as ``"failed"`` so it is not left open.
+    """
+    # Known only after the init call succeeds; the except branch uses it to
+    # close out a record that was created before the failure.
+    record_id = None
+    try:
+        wc = WillisapiClient(env=env)
+        payload = {
+            "filename": os.path.basename(csv_path),
+            "total_rows": total_rows,
+            "upload_type": upload_type,
+        }
+        res = requests.post(
+            wc.get_csv_archive_url(),
+            headers=_archive_headers(api_key),
+            json=payload,
+            timeout=_ARCHIVE_TIMEOUT,
+        )
+        if res.status_code != 201:
+            logger.warning(f"CSV archive init failed: {res.status_code} {res.text}")
+            return None
+
+        body = res.json()
+        record_id = body.get("record_id")
+        presigned = body.get("presigned_url")
+        if not presigned:
+            logger.warning("CSV archive: no presigned URL returned")
+            finalize_metadata_csv(api_key, record_id, "failed", 0, 0, env=env)
+            return None
+
+        with open(csv_path, "rb") as f:
+            put_res = requests.put(
+                presigned,
+                data=f,
+                headers={"Content-Type": "text/csv"},
+                timeout=_ARCHIVE_TIMEOUT,
+            )
+        if put_res.status_code not in (200, 204):
+            logger.warning(f"CSV archive S3 upload failed: {put_res.status_code}")
+            finalize_metadata_csv(api_key, record_id, "failed", 0, 0, env=env)
+            return None
+
+        return record_id
+    except Exception as ex:
+        logger.warning(f"CSV archive error: {ex}")
+        # finalize_metadata_csv returns early without a record_id and logs
+        # (rather than raises) its own request errors, so this is safe here.
+        finalize_metadata_csv(api_key, record_id, "failed", 0, 0, env=env)
+        return None
+
+
+def finalize_metadata_csv(
+    api_key, record_id, status, successful_rows, failed_rows, env=None
+):
+    """Report the CSV archive outcome and parsed row counts to the server.
+
+    Does nothing when ``record_id`` is falsy. Request errors and non-200
+    responses are logged as warnings instead of being raised.
+    """
+    if not record_id:
+        return
+    try:
+        wc = WillisapiClient(env=env)
+        payload = {
+            "status": status,
+            "successful_rows": successful_rows,
+            "failed_rows": failed_rows,
+        }
+        res = requests.patch(
+            wc.get_csv_archive_finalize_url(record_id),
+            headers=_archive_headers(api_key),
+            json=payload,
+            timeout=_ARCHIVE_TIMEOUT,
+        )
+        if res.status_code != 200:
+            logger.warning(f"CSV archive finalize failed: {res.status_code} {res.text}")
+    except Exception as ex:
+        logger.warning(f"CSV archive finalize error: {ex}")
diff --git a/willisapi_client/services/metadata/language_choices.py b/willisapi_client/services/metadata/language_choices.py
index b4678a5..a223171 100644
--- a/willisapi_client/services/metadata/language_choices.py
+++ b/willisapi_client/services/metadata/language_choices.py
@@ -1,85 +1,123 @@
 # https://docs.aws.amazon.com/transcribe/latest/dg/supported-languages.html
-Afrikaans = "af-ZA"
-Arabic_gulf = "ar-AE"
-Arabic_modern_standard = "ar-SA"
-Chinese_simplified = "zh-CN"
-Chinese_traditional = "zh-TW"
-Danish = "da-DK"
-Dutch = "nl-NL"
-English_aus = "en-AU"
-English_birtish = "en-GB"
-English_ind = "en-IN"
-English_irish = "en-IE"
-English_new_zealand = "en-NZ"
-English_scottish = "en-AB"
-English_south_african = "en-ZA"
-English_us = "en-US"
-English_welsh = "en-WL"
-French = "fr-FR"
-French_canadian = "fr-CA"
-Farsi = "fa-IR"
-German = "de-DE"
-German_swiss = "de-CH"
-Hebrew = "he-IL"
-Hindi_ind = "hi-IN"
-Indonesian = "id-ID"
-Italian = "it-IT"
-Japanese = "ja-JP"
-Korean = "ko-KR"
-Malay = "ms-MY"
-Portuguese = "pt-PT"
-Portuguese_brazilian = "pt-BR"
-Russian = "ru-RU"
-Spanish = "es-ES"
-Spanish_us = "es-US"
-Swedish = "sv-SE"
-Tamil = "ta-IN"
-Telugu = "te-IN"
-Thai = "th-TH"
-Turkish = "tr-TR"
-Vietnamese = "vi-VN"
-
-LANGUAGE_CHOICES = [
-    Afrikaans,
-    Arabic_gulf,
-    Arabic_modern_standard,
-    Chinese_simplified,
-    Chinese_traditional,
-    Danish,
-    Dutch,
-    English_aus,
-    English_birtish,
-    English_ind,
-    English_irish,
-    English_new_zealand,
-    English_scottish,
-    English_south_african,
-    English_us,
-    English_welsh,
-    French,
-    French_canadian,
-    Farsi,
-    German,
-    German_swiss,
-    Hebrew,
-    Hindi_ind,
-    Indonesian,
-    Italian,
-    Japanese,
-    Korean,
-    Malay,
-    Portuguese,
-    Portuguese_brazilian,
-    Russian,
-    Spanish,
-    Spanish_us,
-    Swedish,
-    Tamil,
-    Telugu,
-    Thai,
-    Turkish,
-    Vietnamese,
+# (language_code, display_name)
+SUPPORTED_LANGUAGES = [
+    ("en-US", "English (US)"),
+    ("es-US", "Spanish (US)"),
+    ("ab-GE", "Abkhaz"),
+    ("af-ZA", "Afrikaans"),
+    ("sq-AL", "Albanian"),
+    ("am-ET", "Amharic"),
+    ("ar-AE", "Arabic, Gulf"),
+    ("ar-SA", "Arabic, Modern Standard"),
+    ("hy-AM", "Armenian"),
+    ("ast-ES", "Asturian"),
+    ("az-AZ", "Azerbaijani"),
+    ("ba-RU", "Bashkir"),
+    ("eu-ES", "Basque"),
+    ("be-BY", "Belarusian"),
+    ("bn-IN", "Bengali"),
+    ("bs-BA", "Bosnian"),
+    ("my-MM", "Burmese"),
+    ("bg-BG", "Bulgarian"),
+    ("ca-ES", "Catalan"),
+    ("ckb-IR", "Central Kurdish (Iran)"),
+    ("ckb-IQ", "Central Kurdish (Iraq)"),
+    ("zh-HK", "Chinese, Cantonese"),
+    ("zh-CN", "Chinese, Simplified"),
+    ("zh-TW", "Chinese, Traditional"),
+    ("hr-HR", "Croatian"),
+    ("cs-CZ", "Czech"),
+    ("da-DK", "Danish"),
+    ("nl-NL", "Dutch"),
+    ("en-AU", "English, Australian"),
+    ("en-GB", "English, British"),
+    ("en-IN", "English, Indian"),
+    ("en-IE", "English, Irish"),
+    ("en-NZ", "English, New Zealand"),
+    ("en-AB", "English, Scottish"),
+    ("en-ZA", "English, South African"),
+    ("en-WL", "English, Welsh"),
+    ("et-EE", "Estonian"),
+    ("et-ET", "Estonian (et-ET)"),
+    ("fa-IR", "Farsi"),
+    ("fa-AF", "Farsi, Afghan"),
+    ("fi-FI", "Finnish"),
+    ("fr-FR", "French"),
+    ("fr-CA", "French, Canadian"),
+    ("gl-ES", "Galician"),
+    ("ka-GE", "Georgian"),
+    ("de-DE", "German"),
+    ("de-CH", "German, Swiss"),
+    ("el-GR", "Greek"),
+    ("gu-IN", "Gujarati"),
+    ("ht-HT", "Haitian Creole"),
+    ("ha-NG", "Hausa"),
+    ("he-IL", "Hebrew"),
+    ("hi-IN", "Hindi, Indian"),
+    ("hu-HU", "Hungarian"),
+    ("is-IS", "Icelandic"),
+    ("id-ID", "Indonesian"),
+    ("it-IT", "Italian"),
+    ("ja-JP", "Japanese"),
+    ("jv-ID", "Javanese"),
+    ("kab-DZ", "Kabyle"),
+    ("kn-IN", "Kannada"),
+    ("kk-KZ", "Kazakh"),
+    ("km-KH", "Khmer"),
+    ("rw-RW", "Kinyarwanda"),
+    ("ko-KR", "Korean"),
+    ("ky-KG", "Kyrgyz"),
+    ("lv-LV", "Latvian"),
+    ("lt-LT", "Lithuanian"),
+    ("lg-IN", "Luganda"),
+    ("mk-MK", "Macedonian"),
+    ("ms-MY", "Malay"),
+    ("ml-IN", "Malayalam"),
+    ("mt-MT", "Maltese"),
+    ("mr-IN", "Marathi"),
+    ("mhr-RU", "Meadow Mari"),
+    ("mn-MN", "Mongolian"),
+    ("ne-NP", "Nepali"),
+    ("no-NO", "Norwegian Bokmål"),
+    ("or-IN", "Odia/Oriya"),
+    ("ps-AF", "Pashto"),
+    ("pl-PL", "Polish"),
+    ("pt-PT", "Portuguese"),
+    ("pt-BR", "Portuguese, Brazilian"),
+    ("pa-IN", "Punjabi"),
+    ("ro-RO", "Romanian"),
+    ("ru-RU", "Russian"),
+    ("sr-RS", "Serbian"),
+    ("si-LK", "Sinhala"),
+    ("sk-SK", "Slovak"),
+    ("sl-SI", "Slovenian"),
+    ("so-SO", "Somali"),
+    ("es-ES", "Spanish"),
+    ("es-MX", "Spanish, Mexican"),
+    ("su-ID", "Sundanese"),
+    ("sw-KE", "Swahili, Kenya"),
+    ("sw-BI", "Swahili, Burundi"),
+    ("sw-RW", "Swahili, Rwanda"),
+    ("sw-TZ", "Swahili, Tanzania"),
+    ("sw-UG", "Swahili, Uganda"),
+    ("sv-SE", "Swedish"),
+    ("tl-PH", "Tagalog/Filipino"),
+    ("ta-IN", "Tamil"),
+    ("tt-RU", "Tatar"),
+    ("te-IN", "Telugu"),
+    ("th-TH", "Thai"),
+    ("tr-TR", "Turkish"),
+    ("uk-UA", "Ukrainian"),
+    ("ug-CN", "Uyghur"),
+    ("uz-UZ", "Uzbek"),
+    ("vi-VN", "Vietnamese"),
+    ("cy-WL", "Welsh"),
+    ("wo-SN", "Wolof"),
+    ("zu-ZA", "Zulu"),
 ]
+
+LANGUAGE_CHOICES = [code for code, _ in SUPPORTED_LANGUAGES]
+
 SEX_CHOICES = [
     ("Male", "M"),
     ("Female", "F"),
diff --git a/willisapi_client/services/metadata/upload.py b/willisapi_client/services/metadata/upload.py
index 6fc9e67..97a1cb8 100644
--- a/willisapi_client/services/metadata/upload.py
+++ b/willisapi_client/services/metadata/upload.py
@@ -19,6 +19,10 @@
     find_files_with_pattern,
     get_last_n_directories,
 )
+from willisapi_client.services.metadata.archive import (
+    archive_metadata_csv,
+    finalize_metadata_csv,
+)
 
 VALID_SCORE_TYPES = ["rater", "reviewer"]
 
@@ -38,6 +42,15 @@ def upload(api_key: str, csv_path: str, **kwargs):
         headers["Authorization"] = f"token {api_key}"
         logger.info(f'{datetime.now().strftime("%H:%M:%S")}: beginning upload')
 
+        # Archive the source metadata CSV and open a tracking record.
+        archive_record_id = archive_metadata_csv(
+            api_key,
+            csv_path,
+            int(csv.transformed_df.shape[0]),
+            upload_type="data",
+            env=kwargs.get("env"),
+        )
+
         results = []
         for index, row in tqdm(
             csv.transformed_df.iterrows(), total=csv.transformed_df.shape[0]
@@ -96,6 +109,16 @@ def upload(api_key: str, csv_path: str, **kwargs):
                 result_row["error"] = f"{err}"
             results.append(result_row)
 
+        successful_rows = sum(1 for r in results if r.get("upload_status") == "Success")
+        finalize_metadata_csv(
+            api_key,
+            archive_record_id,
+            "successful",
+            successful_rows,
+            len(results) - successful_rows,
+            env=kwargs.get("env"),
+        )
+
         results_df = pd.DataFrame(results)
         return results_df
     else:
@@ -128,6 +151,15 @@ def processed_upload(api_key: str, csv_path: str, output_path: str, **kwargs):
         headers["Authorization"] = f"token {api_key}"
         logger.info(f'{datetime.now().strftime("%H:%M:%S")}: beginning upload')
 
+        # Archive the source processed-data metadata CSV and open a tracking record.
+        archive_record_id = archive_metadata_csv(
+            api_key,
+            csv_path,
+            int(csv.transformed_df.shape[0]),
+            upload_type="processed_data",
+            env=kwargs.get("env"),
+        )
+
         results = []
         for index, row in tqdm(
             csv.transformed_df.iterrows(), total=csv.transformed_df.shape[0]
@@ -205,6 +237,16 @@ def processed_upload(api_key: str, csv_path: str, output_path: str, **kwargs):
                 result_row["error"] = f"{err}"
             results.append(result_row)
 
+        successful_rows = sum(1 for r in results if r.get("upload_status") == "Success")
+        finalize_metadata_csv(
+            api_key,
+            archive_record_id,
+            "successful",
+            successful_rows,
+            len(results) - successful_rows,
+            env=kwargs.get("env"),
+        )
+
         results_df = pd.DataFrame(results)
         return results_df
     else:
diff --git a/willisapi_client/services/metadata/utils.py b/willisapi_client/services/metadata/utils.py
index e0a0957..2bf3411 100644
--- a/willisapi_client/services/metadata/utils.py
+++ b/willisapi_client/services/metadata/utils.py
@@ -421,13 +421,15 @@ def __init__(self, row):
     def validate_row(self):
         if not os.path.exists(self.row.file_path):
             return (False, "File path does not exist")
-        if self.row.language not in LANGUAGE_CHOICES:
-            return (False, f"Invalid language: {self.row.language}")
+        language = getattr(self.row, "language", None)
+        if language and language not in LANGUAGE_CHOICES:
+            return (False, f"Invalid language: {language}")
         return (True, None)
 
     def validate_processed_data_row(self):
-        if self.row.language not in LANGUAGE_CHOICES:
-            return (False, f"Invalid language: {self.row.language}")
+        language = getattr(self.row, "language", None)
+        if language and language not in LANGUAGE_CHOICES:
+            return (False, f"Invalid language: {language}")
         return (True, None)
 
     def calculate_file_checksum(self, file_path: str) -> str:
diff --git a/willisapi_client/willisapi_client.py b/willisapi_client/willisapi_client.py
index 1ce4167..5b49f90 100644
--- a/willisapi_client/willisapi_client.py
+++ b/willisapi_client/willisapi_client.py
@@ -14,18 +14,19 @@ class WillisapiClient:
     def __init__(self, *args, **kwargs) -> None:
         self.client_version = get_client_version()
         self.api_version = math.floor(self.client_version)
-        self.api_uri = "api.brooklyn.health"
+        self.api_uri = "api.willis.health"
         self.env = kwargs["env"] if "env" in kwargs else None
 
-    def get_base_url(self):
+    def get_api_host(self):
         if self.env:
-            return f"https://{self.env}-{self.api_uri}/v{self.api_version}/"
-        return f"https://{self.api_uri}/v{self.api_version}/"
+            return f"api.{self.env}.willis.health"
+        return self.api_uri
+
+    def get_base_url(self):
+        return f"https://{self.get_api_host()}/v{self.api_version}/"
 
     def get_base_v2_url(self):
-        if self.env:
-            return f"https://{self.env}-{self.api_uri}/api/v2/"
-        return f"https://{self.api_uri}/api/v2/"
+        return f"https://{self.get_api_host()}/api/v2/"
 
     def get_diarize_remaining_calls_url(self):
         return self.get_base_url() + "willis-diarize-call-remaining"
@@ -35,9 +36,15 @@ def get_diarize(self):
 
     def get_upload_url(self):
         return self.get_base_v2_url() + "metadata/upload"
-    
+
     def get_processed_upload_url(self):
         return self.get_base_v2_url() + "metadata/processed-data/upload"
 
+    def get_csv_archive_url(self):
+        return self.get_base_v2_url() + "metadata/csv-archive"
+
+    def get_csv_archive_finalize_url(self, record_id):
+        return self.get_csv_archive_url() + f"/{record_id}"
+
     def get_headers(self):
         return {"Content-Type": "application/json", "Accept": "application/json"}