Commit 87b96bd

Fix attach urls
1 parent d11117f commit 87b96bd

5 files changed, +186 -167 lines changed

superannotate/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -50,7 +50,8 @@ def consensus(*args, **kwargs):
     download_image_preannotations, get_image_annotations, get_image_bytes,
     get_image_metadata, get_image_preannotations, search_images,
     search_images_all_folders, set_image_annotation_status,
-    set_images_annotation_statuses, upload_image_annotations, get_project_root_folder_id
+    set_images_annotation_statuses, upload_image_annotations,
+    get_project_root_folder_id
 )
 from .db.project_api import (
     create_folder, delete_folders, get_folder_metadata,
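
Splitting the long import line also keeps get_project_root_folder_id exported at the package root. A minimal sketch of what that enables, assuming the SDK is already initialized with a token; the project name is a placeholder:

    import superannotate as sa

    # get_project_root_folder_id is imported in superannotate/__init__.py,
    # so it is callable straight off the package. "Example Project" is
    # hypothetical; the function takes project metadata, as in the diff below.
    project = sa.get_project_metadata("Example Project")
    root_folder_id = sa.get_project_root_folder_id(project)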

superannotate/common.py

Lines changed: 5 additions & 5 deletions
@@ -52,11 +52,11 @@
 }

 _MODEL_TRAINING_TASKS = {
-    "Instance Segmentation for Pixel Projects" : "instance_segmentation_pixel",
-    "Instance Segmentation for Vector Projects" : "instance_segmentation_vector",
-    "Keypoint Detection for Vector Projects" : "keypoint_detection_vector",
-    "Object Detection for Vector Projects" : "object_detection_vector",
-    "Semantic Segmentation for Pixel Projects" : "semantic_segmentation_pixel"
+    "Instance Segmentation for Pixel Projects": "instance_segmentation_pixel",
+    "Instance Segmentation for Vector Projects": "instance_segmentation_vector",
+    "Keypoint Detection for Vector Projects": "keypoint_detection_vector",
+    "Object Detection for Vector Projects": "object_detection_vector",
+    "Semantic Segmentation for Pixel Projects": "semantic_segmentation_pixel"
 }
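
For context, _MODEL_TRAINING_TASKS maps the human-readable training task names to the identifiers the backend expects; the hunk above only normalizes spacing around the colons. A minimal illustration of the lookup (the dict is private to superannotate/common.py, not public API):

    # Illustration only; _MODEL_TRAINING_TASKS is the dict defined above.
    task_id = _MODEL_TRAINING_TASKS["Keypoint Detection for Vector Projects"]
    assert task_id == "keypoint_detection_vector"
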
superannotate/db/projects.py

Lines changed: 14 additions & 148 deletions
@@ -38,8 +38,11 @@
     get_project_metadata_with_users
 )
 from .users import get_team_contributor_metadata
-from .utils import _get_upload_auth_token, _get_boto_session_by_credentials, _upload_images, create_empty_annotation, upload_image_array_to_s3, get_image_array_to_upload
+from .utils import _get_upload_auth_token, _get_boto_session_by_credentials, _upload_images, _attach_urls
+from tqdm import tqdm

+_NUM_THREADS = 10
+_TIME_TO_UPDATE_IN_TQDM = 1
 logger = logging.getLogger("superannotate-python-sdk")

 _api = API.get_instance()
@@ -690,7 +693,6 @@ def attach_image_urls_to_project(
     :return: list of linked image names, list of failed image names, list of duplicate image names
     :rtype: tuple
     """
-
     project, folder = get_project_and_folder_metadata(project)
     folder_name = project["name"] + (f'/{folder["name"]}' if folder else "")
     upload_state = common.upload_state_int_to_str(project.get("upload_state"))
@@ -703,166 +705,30 @@
     team_id, project_id = project["team_id"], project["id"]
     image_data = pd.read_csv(attachments, dtype=str)
     image_data = image_data[~image_data["url"].isnull()]
-    existing_names = image_data[~image_data["name"].isnull()]
-    duplicate_idx_csv = existing_names.duplicated(subset="name", keep="first")
-    duplicate_images = existing_names[duplicate_idx_csv]["name"].tolist()
-    existing_names = existing_names[~duplicate_idx_csv]
-    existing_images = search_images((project, folder))
-    duplicate_idx = []
     for ind, _ in image_data[image_data["name"].isnull()].iterrows():
-        while True:
-            name_try = str(uuid.uuid4())
-            if name_try not in existing_images:
-                image_data.at[ind, "name"] = name_try
-                existing_images.append(name_try)
-                break
-    image_data.drop_duplicates(subset="name", keep="first", inplace=True)
-    for ind, row in existing_names.iterrows():
-        if row["name"] in existing_images:
-            duplicate_idx.append(ind)
-    duplicate_images.extend(image_data.loc[duplicate_idx]["name"].tolist())
-    image_data.drop(labels=duplicate_idx, inplace=True)
-    if len(duplicate_images) != 0:
-        logger.warning(
-            "%s already existing images found that won't be uploaded.",
-            len(duplicate_images)
-        )
+        name_try = str(uuid.uuid4())
+        image_data.at[ind, "name"] = name_try
     image_data = pd.DataFrame(image_data, columns=["name", "url"])
     img_names_urls = image_data.values.tolist()
-    logger.info(
-        "Uploading %s images to project %s.", len(img_names_urls), folder_name
-    )
-    if len(img_names_urls) == 0:
-        return ([], [], duplicate_images)

     if folder:
         folder_id = folder["id"]
     else:
         folder_id = get_project_root_folder_id(project)

-    params = {'team_id': team_id, 'folder_id': folder_id}
-    uploaded = [[] for _ in range(_NUM_THREADS)]
-    tried_upload = [[] for _ in range(_NUM_THREADS)]
-    couldnt_upload = [[] for _ in range(_NUM_THREADS)]
-    finish_event = threading.Event()
-
-    res = _get_upload_auth_token(params=params, project_id=project_id)
-
-    prefix = res['filePath']
-    limit = res['availableImageCount']
-    images_to_upload = img_names_urls[:limit]
-    images_to_skip = img_names_urls[limit:]
-    chunksize = int(math.ceil(len(images_to_upload) / _NUM_THREADS))
-
-    tqdm_thread = threading.Thread(
-        target=__tqdm_thread_image_upload,
-        args=(len(images_to_upload), tried_upload, finish_event),
-        daemon=True
+    list_of_uploaded, list_of_not_uploaded, duplicate_images = _attach_urls(
+        img_names_urls=img_names_urls,
+        team_id=team_id,
+        folder_id=folder_id,
+        project_id=project_id,
+        annotation_status=annotation_status,
+        project=project,
+        folder_name=folder_name
     )
-    tqdm_thread.start()
-    threads = []
-    for thread_id in range(_NUM_THREADS):
-        t = threading.Thread(
-            target=__attach_image_urls_to_project_thread,
-            args=(
-                res, images_to_upload, project, annotation_status, prefix,
-                thread_id, chunksize, couldnt_upload, uploaded, tried_upload,
-                folder_id
-            ),
-            daemon=True
-        )
-        threads.append(t)
-        t.start()
-    for t in threads:
-        t.join()
-    finish_event.set()
-    tqdm_thread.join()
-    list_of_not_uploaded = []
-    for couldnt_upload_thread in couldnt_upload:
-        for f in couldnt_upload_thread:
-            list_of_not_uploaded.append(str(f))
-    list_of_uploaded = []
-    for upload_thread in uploaded:
-        for f in upload_thread:
-            list_of_uploaded.append(str(f))

-    list_of_not_uploaded += [i[0] for i in images_to_skip]
     return (list_of_uploaded, list_of_not_uploaded, duplicate_images)


-def __attach_image_urls_to_project_thread(
-    res, img_names_urls, project, annotation_status, prefix, thread_id,
-    chunksize, couldnt_upload, uploaded, tried_upload, project_folder_id
-):
-    len_img_paths = len(img_names_urls)
-    start_index = thread_id * chunksize
-    end_index = start_index + chunksize
-    if start_index >= len_img_paths:
-        return
-    s3_session = _get_boto_session_by_credentials(res)
-    s3_resource = s3_session.resource('s3')
-    bucket = s3_resource.Bucket(res["bucket"])
-    prefix = res['filePath']
-    uploaded_imgs = []
-    uploaded_imgs_info = ([], [], [])
-    for i in range(start_index, end_index):
-        if i >= len_img_paths:
-            break
-        name, _ = img_names_urls[i]
-        tried_upload[thread_id].append(name)
-        img_name_hash = str(uuid.uuid4()) + Path(name).suffix
-        key = prefix + img_name_hash
-        try:
-            bucket.put_object(
-                Body=json.dumps(create_empty_annotation((None, None), name)),
-                Key=key + ".json"
-            )
-        except Exception as e:
-            logger.warning("Unable to upload image %s. %s", name, e)
-            couldnt_upload[thread_id].append(name)
-            continue
-        else:
-            uploaded_imgs.append(name)
-            uploaded_imgs_info[0].append(img_names_urls[i])
-            uploaded_imgs_info[1].append(key)
-            uploaded_imgs_info[2].append((None, None))
-        if len(uploaded_imgs) >= 100:
-            try:
-                __create_image(
-                    uploaded_imgs_info[0],
-                    uploaded_imgs_info[1],
-                    project,
-                    annotation_status,
-                    prefix,
-                    uploaded_imgs_info[2],
-                    project_folder_id,
-                    upload_state="External"
-                )
-            except SABaseException as e:
-                couldnt_upload[thread_id] += uploaded_imgs
-                logger.warning(e)
-            else:
-                uploaded[thread_id] += uploaded_imgs
-            uploaded_imgs = []
-            uploaded_imgs_info = ([], [], [])
-    try:
-        __create_image(
-            uploaded_imgs_info[0],
-            uploaded_imgs_info[1],
-            project,
-            annotation_status,
-            prefix,
-            uploaded_imgs_info[2],
-            project_folder_id,
-            upload_state="External"
-        )
-    except SABaseException as e:
-        couldnt_upload[thread_id] += uploaded_imgs
-        logger.warning(e)
-    else:
-        uploaded[thread_id] += uploaded_imgs
-
-
 def upload_images_from_public_urls_to_project(
     project,
     img_urls,
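The rewrite above delegates the threaded S3 upload to the new _attach_urls helper and keeps only the CSV parsing in attach_image_urls_to_project. A minimal sketch of calling it; the file and project names are placeholders, and the status value is assumed to be a valid annotation status. Per the parsing above, the CSV needs a url column and an optional name column: rows with an empty url are dropped, and rows missing a name get a generated uuid4 name.

    from superannotate import attach_image_urls_to_project

    # attachments.csv (hypothetical contents):
    #   name,url
    #   img_1.jpg,https://example.com/img_1.jpg
    #   ,https://example.com/img_2.jpg      <- gets a uuid4 name
    uploaded, failed, duplicates = attach_image_urls_to_project(
        project="Example Project",      # "project" or "project/folder" path
        attachments="attachments.csv",
        annotation_status="NotStarted"  # assumed status name
    )
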
superannotate/db/search_projects.py

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ def search_projects(
     params = {
         'team_id': str(_api.team_id),
         'offset': 0,
-        'limit' : limit,
+        'limit': limit,
         'completeImagesCount': include_complete_image_count
     }
     if name is not None:

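A minimal usage sketch, assuming the public signature matches the parameters visible in this hunk (name and include_complete_image_count); the name prefix below is a placeholder:

    from superannotate import search_projects

    # Assumption based on the params above: filters projects by name and,
    # when include_complete_image_count=True, returns completed-image counts.
    projects = search_projects(
        name="Example", include_complete_image_count=True
    )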