Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 51 additions & 1 deletion src/models/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -661,14 +661,39 @@ def _create_model_card_section(self, metadata: Dict[str, Any]) -> Dict[str, Any]
props.append({"name": "genai:aibom:modelcard:quantizationFileType", "value": str(q_dict["file_type"])})
taxonomy_mapped_keys.append("quantization")

# Training Data Completeness Check
has_training_data = self._verify_datasets_available(metadata)
props.append({"name": "genai:aibom:trainingDataAvailable", "value": "true" if has_training_data else "false"})

# Add status note about dataset verification
if has_training_data:
props.append({
"name": "genai:aibom:trainingDataStatus",
"value": "Training datasets verified: Dataset(s) exist and are accessible on Hugging Face Hub."
})
else:
# Dataset referenced but not found/verified
if "datasets" in metadata and metadata.get("datasets"):
props.append({
"name": "genai:aibom:trainingDataWarning",
"value": "Training datasets were referenced but could not be verified on Hugging Face Hub. Dataset may not exist, be disabled, or be inaccessible."
})
else:
# No dataset info at all
props.append({
"name": "genai:aibom:trainingDataWarning",
"value": "Training data information is missing or not documented. This limits transparency and auditability of the model."
})

# Basic Fields we've already mapped to structured homes
mapped_fields = [
"primaryPurpose", "typeOfModel", "suppliedBy", "intendedUse",
"technicalLimitations", "ethicalConsiderations", "datasets", "eval_results",
"pipeline_tag", "name", "author", "license", "description",
"commit", "bomFormat", "specVersion", "version", "licenses",
"external_references", "tags", "library_name", "paper", "downloadLocation",
"gguf_filename", "gguf_license", "model_type", "architectures"
"gguf_filename", "gguf_license", "model_type", "architectures",
"trainingDataAvailable", "trainingDataWarning"
] + taxonomy_mapped_keys

for k, v in metadata.items():
Expand Down Expand Up @@ -719,3 +744,28 @@ def _infer_io_formats(self, task: str) -> tuple:
return (["csv", "json"], ["string", "number"])

return ([], [])

def _verify_datasets_available(self, metadata: Dict[str, Any]) -> bool:
    """Report whether at least one referenced training dataset can be verified.

    The ``"datasets"`` entry of ``metadata`` may be a single dataset id
    string, a dict carrying the id under ``"name"``, or an iterable mixing
    both of those shapes. Ids are stripped of surrounding whitespace; blank
    ids and the placeholder ``"unknown"`` (any letter case) are ignored.

    Args:
        metadata: Extracted model metadata; only the ``"datasets"`` key is read.

    Returns:
        True if ``self._verify_dataset_exists_on_hf`` accepts at least one
        candidate id, otherwise False (including when nothing usable is
        listed or the value has an unsupported type).
    """
    datasets = metadata.get("datasets")
    if not datasets:
        return False

    # Normalize every supported shape to a list of raw entries.
    if isinstance(datasets, (str, dict)):
        entries = [datasets]
    else:
        try:
            entries = list(datasets)
        except TypeError:
            # Not iterable (e.g. a number): nothing that could be verified.
            return False

    candidates = []
    for entry in entries:
        if isinstance(entry, dict):
            # Dict entries carry the dataset id under "name".
            entry = entry.get("name", "")
        if not isinstance(entry, str):
            continue
        dataset_id = entry.strip()
        if dataset_id and dataset_id.lower() != "unknown":
            candidates.append(dataset_id)

    # any() over an empty list is already False, so no extra guard is needed.
    return any(self._verify_dataset_exists_on_hf(d) for d in candidates)

def _verify_dataset_exists_on_hf(self, dataset_id: str) -> bool:
    """Check whether ``dataset_id`` can be looked up through ``self.hf_api``.

    This is a best-effort probe: any exception raised by the lookup is
    treated as "not verified" rather than propagated, so callers always get
    a boolean. The failure is logged at debug level so that lookup errors
    are not lost entirely.

    Args:
        dataset_id: Repository id passed to ``self.hf_api.dataset_info``.

    Returns:
        True if the lookup returns a non-None info object whose ``disabled``
        attribute is missing or falsy; False otherwise.
    """
    import logging

    # A blank or non-string id can never resolve; skip the remote call.
    if not isinstance(dataset_id, str) or not dataset_id.strip():
        return False

    try:
        info = self.hf_api.dataset_info(repo_id=dataset_id)
        return info is not None and not getattr(info, "disabled", False)
    except Exception as exc:
        # Deliberately broad: verification must never break the caller.
        logging.getLogger(__name__).debug(
            "Dataset verification failed for %r: %s", dataset_id, exc
        )
        return False
103 changes: 103 additions & 0 deletions tests/test_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,5 +119,108 @@ def test_generate_purl_no_namespace(self):
purl = self.service._generate_purl("model", "1.0")
self.assertEqual(purl, "pkg:huggingface/model@1.0")

@patch("src.models.service.calculate_completeness_score")
@patch("src.models.service.EnhancedExtractor")
def test_training_data_flag_with_datasets(self, mock_extractor_cls, mock_score):
    """Verified datasets yield trainingDataAvailable == "true" and no warning property."""

    def first_property(props, wanted_name):
        # Return the first property dict carrying the given name, or None.
        for prop in props:
            if prop["name"] == wanted_name:
                return prop
        return None

    # Arrange: the extractor reports two datasets for the model.
    extractor = mock_extractor_cls.return_value
    extractor.extract_metadata.return_value = {
        "name": "test-model",
        "datasets": ["dataset1", "dataset2"],
        "commit": "123456",
    }
    extractor.extraction_results = {}
    mock_score.return_value = {"total_score": 50}
    self.service.hf_api.model_info.return_value = MagicMock(sha="123456")

    # Act: every dataset lookup is forced to succeed.
    with patch.object(self.service, '_verify_dataset_exists_on_hf', return_value=True):
        aibom = self.service.generate_aibom("owner/model")

    # Assert: flag is present and true.
    card = aibom["components"][0].get("modelCard", {})
    card_props = card.get("properties", [])

    flag = first_property(card_props, "genai:aibom:trainingDataAvailable")
    self.assertIsNotNone(flag)
    self.assertEqual(flag["value"], "true")

    # Assert: no warning accompanies verified datasets.
    self.assertIsNone(first_property(card_props, "genai:aibom:trainingDataWarning"))

@patch("src.models.service.calculate_completeness_score")
@patch("src.models.service.EnhancedExtractor")
def test_training_data_flag_without_datasets(self, mock_extractor_cls, mock_score):
    """Metadata without datasets yields trainingDataAvailable == "false" plus a warning."""

    def first_property(props, wanted_name):
        # Return the first property dict carrying the given name, or None.
        for prop in props:
            if prop["name"] == wanted_name:
                return prop
        return None

    # Arrange: the extracted metadata has no "datasets" key at all.
    extractor = mock_extractor_cls.return_value
    extractor.extract_metadata.return_value = {
        "name": "test-model",
        "commit": "123456",
    }
    extractor.extraction_results = {}
    mock_score.return_value = {"total_score": 50}
    self.service.hf_api.model_info.return_value = MagicMock(sha="123456")

    # Act
    aibom = self.service.generate_aibom("owner/model")

    # Assert: flag is present and false.
    card = aibom["components"][0].get("modelCard", {})
    card_props = card.get("properties", [])

    flag = first_property(card_props, "genai:aibom:trainingDataAvailable")
    self.assertIsNotNone(flag)
    self.assertEqual(flag["value"], "false")

    # Assert: the "missing information" warning is emitted.
    warning = first_property(card_props, "genai:aibom:trainingDataWarning")
    self.assertIsNotNone(warning)
    self.assertIn("Training data information is missing", warning["value"])

def test_verify_datasets_available_with_valid_datasets(self):
    """Each supported "datasets" shape is reported available when lookups succeed."""
    accepted_shapes = [
        # A list of dataset ids
        {"datasets": ["dataset1", "dataset2"]},
        # A single id given as a bare string
        {"datasets": "valid_dataset"},
        # A dict carrying the id under "name"
        {"datasets": {"name": "my_dataset", "url": "https://example.com"}},
    ]

    # Stub the per-dataset lookup so it always succeeds.
    with patch.object(self.service, '_verify_dataset_exists_on_hf', return_value=True):
        for metadata in accepted_shapes:
            self.assertTrue(self.service._verify_datasets_available(metadata))

def test_verify_datasets_available_with_empty_datasets(self):
    """Empty, blank, placeholder, or absent datasets are rejected before any lookup.

    The per-dataset verifier is patched to always succeed, so a False result
    can only come from the filtering logic itself. Asserting that the
    verifier was never called also guarantees that no lookup is attempted
    for unusable values.
    """
    with patch.object(
        self.service, "_verify_dataset_exists_on_hf", return_value=True
    ) as mock_verify:
        # Empty list
        metadata = {"datasets": []}
        self.assertFalse(self.service._verify_datasets_available(metadata))

        # List with empty strings
        metadata = {"datasets": ["", " ", ""]}
        self.assertFalse(self.service._verify_datasets_available(metadata))

        # Unknown placeholder
        metadata = {"datasets": ["unknown"]}
        self.assertFalse(self.service._verify_datasets_available(metadata))

        # No datasets key
        metadata = {"name": "test-model"}
        self.assertFalse(self.service._verify_datasets_available(metadata))

        # None of the inputs above should ever reach the lookup.
        mock_verify.assert_not_called()

# Run the unittest test runner when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()