Skip to content

Commit a144dbd

Browse files
committed
added GenAI csv upload
1 parent 5ab170a commit a144dbd

File tree

16 files changed

+914
-78
lines changed

16 files changed

+914
-78
lines changed

pytest.ini

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,5 @@ minversion = 3.7
33
log_cli=true
44
python_files = test_*.py
55
;pytest_plugins = ['pytest_profiling']
6-
addopts = -n auto --dist=loadscope
6+
;addopts = -n auto --dist=loadscope
77

src/superannotate/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import sys
44

55

6-
__version__ = "4.4.22"
6+
__version__ = "4.4.23dev"
77

88
sys.path.append(os.path.split(os.path.realpath(__file__))[0])
99

src/superannotate/lib/app/helpers.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,50 @@ def get_s3_annotation_paths(folder_path, s3_bucket, annotation_paths, recursive)
7777
return list(set(annotation_paths))
7878

7979

80+
def convert_column_to_lowercase(df, column_name):
    """Rename the column matching ``column_name`` (ignoring case) to ``column_name``.

    Despite the function name, no cell values are modified and nothing is
    forced to lowercase: the first column whose label equals ``column_name``
    case-insensitively is renamed to exactly ``column_name``.

    :param df: data frame whose header should be normalized
    :param column_name: canonical column name to search for and rename to
    :return: a data frame with the matched column renamed
    :raises Exception: if no column label matches ``column_name``
    """
    wanted = column_name.lower()
    # Only string labels can match; non-string labels (e.g. integers) are
    # skipped instead of blowing up with AttributeError on ``.lower()``.
    actual_column_name = next(
        (
            col
            for col in df.columns
            if isinstance(col, str) and col.lower() == wanted
        ),
        None,
    )
    # Explicit ``None`` check so that "no match" is the only failure condition.
    if actual_column_name is None:
        raise Exception(f"Column '{column_name}' not found.")
    return df.rename(columns={actual_column_name: column_name})
89+
90+
91+
def truncate_long_names(name, length=120):
    """Shorten ``name`` to its first ``length`` characters when it is longer.

    :param name: name to shorten
    :param length: maximum number of characters to keep (120 by default)
    :return: ``name`` itself when it already fits, otherwise its leading
        ``length`` characters
    """
    return name[:length] if len(name) > length else name
96+
97+
98+
def get_gen_ai_csv_data(csv_path):
    """Load a GenAI attachment CSV and return its rows as dictionaries.

    Every column is read as a string. A ``_folder`` column, if present, is
    dropped. The ``_item_name`` header is matched case-insensitively and
    renamed to exactly ``_item_name``; missing cells become empty strings.

    :param csv_path: path of the CSV file to read
    :return: list with one ``{column: value}`` dict per CSV row
    :raises Exception: if the CSV has no ``_item_name`` column (in any casing)
    """

    def _normalize_name(raw_name):
        # Blank or whitespace-only names are replaced by a random UUID;
        # every name is then capped by ``truncate_long_names``.
        if str(raw_name).strip():
            candidate = raw_name
        else:
            candidate = str(uuid.uuid4())
        return truncate_long_names(candidate)

    frame = pd.read_csv(csv_path, engine="python", quotechar='"', dtype=str)
    frame = frame.drop(columns=["_folder"], errors="ignore")
    # Normalize the header spelling of the name column (values are untouched).
    frame = convert_column_to_lowercase(frame, "_item_name")
    # Empty cells are represented as "" rather than NaN.
    frame = frame.fillna("")
    frame["_item_name"] = frame["_item_name"].apply(_normalize_name)
    return frame.to_dict(orient="records")
122+
123+
80124
def get_name_url_duplicated_from_csv(csv_path):
81125
image_data = pd.read_csv(csv_path, dtype=str)
82126
image_data.replace({pd.NA: None}, inplace=True)

src/superannotate/lib/app/interface/sdk_interface.py

Lines changed: 120 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import lib.core as constants
3030
from lib.app.helpers import get_annotation_paths
3131
from lib.app.helpers import get_name_url_duplicated_from_csv
32+
from lib.app.helpers import get_gen_ai_csv_data
3233
from lib.app.helpers import wrap_error as wrap_validation_errors
3334
from lib.app.interface.base_interface import BaseInterfaceFacade
3435
from lib.app.interface.base_interface import TrackableMeta
@@ -45,6 +46,7 @@
4546
from lib.core.conditions import Condition
4647
from lib.core.conditions import EmptyCondition
4748
from lib.core.entities import AttachmentEntity
49+
from lib.core.entities import GenAIAttachmentEntity
4850
from lib.core.entities import WorkflowEntity
4951
from lib.core.entities import SettingEntity
5052
from lib.core.entities.classes import AnnotationClassEntity
@@ -112,6 +114,12 @@ class Attachment(TypedDict, total=False):
112114
integration: NotRequired[str] # noqa
113115

114116

117+
class GenAIAttachment(TypedDict, total=False):
    # Typed shape of one GenAI attachment dict; total=False makes every key optional.
    _item_name: Optional[str]
    _item_category: Optional[str]
    # component id -> value map
    # NOTE(review): presumably any further keys are component ids mapped to
    # their values; they are not declared here - confirm against callers.
121+
122+
115123
class SAClient(BaseInterfaceFacade, metaclass=TrackableMeta):
116124
"""Create SAClient instance to authorize SDK in a team scope.
117125
In case of no argument has been provided, SA_TOKEN environmental variable
@@ -1187,8 +1195,22 @@ def prepare_export(
11871195
:param only_pinned: enable only pinned output in export. This option disables all other types of output.
11881196
:type only_pinned: bool
11891197
1190-
:param kwargs: Arbitrary kwarg ``integration_name``
1191-
can be provided which will be used as a storage to store export file
1198+
:param kwargs:
1199+
Arbitrary kwargs:
1200+
* integration_name: can be provided which will be used as a storage to store export file
1201+
* format: can be CSV for the Gen AI projects
1202+
1203+
Request Example:
1204+
::
1205+
client = SAClient()
1206+
1207+
export = client.prepare_export(
1208+
project = "Project Name",
1209+
folder_names = ["Folder 1", "Folder 2"],
1210+
annotation_statuses = ["Completed","QualityCheck"],
1211+
format = "CSV")
1212+
1213+
client.download_export("Project Name", export, "path_to_download")
11921214
11931215
:return: metadata object of the prepared export
11941216
:rtype: dict
@@ -1216,13 +1238,20 @@ def prepare_export(
12161238
break
12171239
else:
12181240
raise AppException("Integration not found.")
1241+
_export_type = None
1242+
export_type = kwargs.get("format")
1243+
if export_type:
1244+
export_type = export_type.lower()
1245+
if export_type == "csv":
1246+
_export_type = 3
12191247
response = self.controller.prepare_export(
12201248
project_name=project_name,
12211249
folder_names=folders,
12221250
include_fuse=include_fuse,
12231251
only_pinned=only_pinned,
12241252
annotation_statuses=annotation_statuses,
12251253
integration_id=integration_id,
1254+
export_type=_export_type,
12261255
)
12271256
if response.errors:
12281257
raise AppException(response.errors)
@@ -2632,7 +2661,7 @@ def search_items(
26322661
def attach_items(
26332662
self,
26342663
project: Union[NotEmptyStr, dict],
2635-
attachments: Union[NotEmptyStr, Path, conlist(Attachment, min_items=1)],
2664+
attachments: Union[NotEmptyStr, Path, List[dict]],
26362665
annotation_status: Optional[ANNOTATION_STATUS] = "NotStarted",
26372666
):
26382667
"""Link items from external storage to SuperAnnotate using URLs.
@@ -2657,7 +2686,7 @@ def attach_items(
26572686
:return: uploaded, failed and duplicated item names
26582687
:rtype: tuple of list of strs
26592688
2660-
Example for Vector, Video, Document, PointCloud projects:
2689+
Example:
26612690
::
26622691
26632692
client = SAClient()
@@ -2666,96 +2695,120 @@ def attach_items(
26662695
attachments=[{"name": "item", "url": "https://..."}]
26672696
)
26682697
2669-
Example for GenAI projects:
2698+
Example of attaching items from custom integration:
26702699
::
26712700
26722701
client = SAClient()
26732702
client.attach_items(
26742703
project="Medical Annotations",
26752704
attachments=[
26762705
{
2677-
"_item_name": "item",
2678-
"_folder": "QA1",
2679-
"_item_category": "karyology",
2680-
"component_id_0": "val",
2681-
...
2682-
}
2683-
]
2684-
)
2706+
"name": "item",
2707+
"url": "https://bucket-name.s3…/example.png",
2708+
"integration": "custom-integration-name"
2709+
}
2710+
]
2711+
)
26852712
2686-
Example of attaching items from custom integration:
2713+
Example of attaching items for GenAI projects:
26872714
::
26882715
26892716
client = SAClient()
26902717
client.attach_items(
26912718
project="Medical Annotations",
26922719
attachments=[
26932720
{
2694-
"name": "item",
2695-
"url": "https://bucket-name.s3…/example.png"
2696-
"integration": "custom-integration-name"
2721+
"_item_name": "item",
2722+
"_category": "heart",
2723+
"category_text_input": "value1",
2724+
"category_numeric_input": "value1",
2725+
"category_approve_input": 0,
2726+
"category_rating_input": 4,
2727+
"category_slider_input": 23,
2728+
"category_multiselect": ["Option 1"],
2729+
"category_checkbox_input": ["Option 1","Option 3"],
26972730
}
26982731
]
26992732
)
27002733
"""
27012734

27022735
project_name, folder_name = extract_project_folder(project)
2703-
try:
2704-
attachments = parse_obj_as(List[AttachmentEntity], attachments)
2705-
unique_attachments = set(attachments)
2706-
duplicate_attachments = [
2707-
item
2708-
for item, count in collections.Counter(attachments).items()
2709-
if count > 1
2710-
]
2711-
except ValidationError:
2712-
(
2713-
unique_attachments,
2714-
duplicate_attachments,
2715-
) = get_name_url_duplicated_from_csv(attachments)
2716-
if duplicate_attachments:
2717-
logger.info("Dropping duplicates.")
2718-
unique_attachments = parse_obj_as(List[AttachmentEntity], unique_attachments)
2736+
project, folder = self.controller.get_project_folder(project_name, folder_name)
27192737
uploaded, fails, duplicated = [], [], []
2720-
_unique_attachments = []
2721-
if any(i.integration for i in unique_attachments):
2722-
integtation_item_map = {
2723-
i.name: i
2724-
for i in self.controller.integrations.list().data
2725-
if i.type == IntegrationTypeEnum.CUSTOM
2726-
}
2727-
invalid_integrations = set()
2728-
for attachment in unique_attachments:
2729-
if attachment.integration:
2730-
if attachment.integration in integtation_item_map:
2731-
attachment.integration_id = integtation_item_map[
2732-
attachment.integration
2733-
].id
2734-
else:
2735-
invalid_integrations.add(attachment.integration)
2736-
continue
2737-
_unique_attachments.append(attachment)
2738-
if invalid_integrations:
2739-
logger.error(
2740-
f"The ['{','.join(invalid_integrations)}'] integrations specified for the items doesn't exist in the "
2741-
"list of integrations on the platform. Any associated items will be skipped."
2738+
if project.type == ProjectType.GEN_AI.value:
2739+
if isinstance(attachments, (str, Path)):
2740+
attachments = parse_obj_as(
2741+
List[GenAIAttachmentEntity],
2742+
get_gen_ai_csv_data(csv_path=attachments),
27422743
)
2743-
else:
2744-
_unique_attachments = unique_attachments
2745-
2746-
if _unique_attachments:
2747-
logger.info(
2748-
f"Attaching {len(_unique_attachments)} file(s) to project {project}."
2749-
)
2750-
project, folder = self.controller.get_project_folder(
2751-
project_name, folder_name
2752-
)
2753-
response = self.controller.items.attach(
2744+
else:
2745+
attachments = parse_obj_as(List[GenAIAttachmentEntity], attachments)
2746+
response = self.controller.items.attach_gen_ai_data(
27542747
project=project,
27552748
folder=folder,
2756-
attachments=_unique_attachments,
2749+
attachments=attachments,
27572750
annotation_status=annotation_status,
2751+
user=self.controller.current_user,
27582752
)
2753+
uploaded, duplicated, failed = response.data
2754+
else:
2755+
try:
2756+
attachments = parse_obj_as(List[AttachmentEntity], attachments)
2757+
unique_attachments = set(attachments)
2758+
duplicate_attachments = [
2759+
item
2760+
for item, count in collections.Counter(attachments).items()
2761+
if count > 1
2762+
]
2763+
except ValidationError:
2764+
(
2765+
unique_attachments,
2766+
duplicate_attachments,
2767+
) = get_name_url_duplicated_from_csv(attachments)
2768+
if duplicate_attachments:
2769+
logger.info("Dropping duplicates.")
2770+
unique_attachments = parse_obj_as(
2771+
List[AttachmentEntity], unique_attachments
2772+
)
2773+
_unique_attachments = []
2774+
if any(i.integration for i in unique_attachments):
2775+
integtation_item_map = {
2776+
i.name: i
2777+
for i in self.controller.integrations.list().data
2778+
if i.type == IntegrationTypeEnum.CUSTOM
2779+
}
2780+
invalid_integrations = set()
2781+
for attachment in unique_attachments:
2782+
if attachment.integration:
2783+
if attachment.integration in integtation_item_map:
2784+
attachment.integration_id = integtation_item_map[
2785+
attachment.integration
2786+
].id
2787+
else:
2788+
invalid_integrations.add(attachment.integration)
2789+
continue
2790+
_unique_attachments.append(attachment)
2791+
if invalid_integrations:
2792+
logger.error(
2793+
f"The ['{','.join(invalid_integrations)}'] integrations specified for the items doesn't exist in the "
2794+
"list of integrations on the platform. Any associated items will be skipped."
2795+
)
2796+
else:
2797+
_unique_attachments = unique_attachments
2798+
2799+
if _unique_attachments:
2800+
logger.info(
2801+
f"Attaching {len(_unique_attachments)} file(s) to project {project}."
2802+
)
2803+
project, folder = self.controller.get_project_folder(
2804+
project_name, folder_name
2805+
)
2806+
response = self.controller.items.attach(
2807+
project=project,
2808+
folder=folder,
2809+
attachments=_unique_attachments,
2810+
annotation_status=annotation_status,
2811+
)
27592812
if response.errors:
27602813
raise AppException(response.errors)
27612814
uploaded, duplicated = response.data

src/superannotate/lib/core/entities/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from lib.core.entities.items import VideoEntity
1313
from lib.core.entities.project import AttachmentEntity
1414
from lib.core.entities.project import ContributorEntity
15+
from lib.core.entities.project import GenAIAttachmentEntity
1516
from lib.core.entities.project import MLModelEntity
1617
from lib.core.entities.project import ProjectEntity
1718
from lib.core.entities.project import SettingEntity
@@ -37,6 +38,7 @@
3738
"DocumentEntity",
3839
# Utils
3940
"AttachmentEntity",
41+
"GenAIAttachmentEntity",
4042
# project
4143
"ProjectEntity",
4244
"ContributorEntity",

src/superannotate/lib/core/entities/project.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,29 @@ def __hash__(self):
5050
return hash(self.name)
5151

5252

53+
class GenAIAttachmentEntity(BaseModel):
    """Attachment entity for GenAI projects.

    In addition to the attributes declared below, arbitrary extra keys are
    accepted because the model config sets ``extra = Extra.allow``.
    """

    # NOTE(review): pydantic v1 does not treat underscore-prefixed annotations
    # as regular model fields, so these defaults (including the UUID factory)
    # may never be applied - confirm against the pydantic version in use.
    _item_name: Optional[str] = Field(default_factory=lambda: str(uuid.uuid4()))
    _item_category: Optional[str] = Field(default=None)

    integration: Optional[str] = None
    integration_id: Optional[int] = None

    @property
    def name(self):
        """Item name, read from ``_item_name``."""
        return self._item_name

    @property
    def item_category(self):
        """Item category, read from ``_item_category``."""
        return self._item_category

    @property
    def item_categoty(self):
        """Misspelled alias of ``item_category``, kept for backward compatibility."""
        return self._item_category

    class Config:
        include_private_fields = True
        extra = Extra.allow

    def __hash__(self):
        # The hash is derived from the item name only.
        return hash(self.name)
74+
75+
5376
class WorkflowEntity(BaseModel):
5477
id: Optional[int]
5578
project_id: Optional[int]

0 commit comments

Comments
 (0)