From de40f79f4273446dd6107b6d478f16efdbd2df79 Mon Sep 17 00:00:00 2001
From: seomis <ppsimoes@worten.pt>
Date: Sun, 22 Nov 2020 20:27:03 +0000
Subject: [PATCH 1/2] [WDT-XXX] Use a thread pool executor to speed up image
 downloads in download.py

---
 download.py | 54 +++++++++++++++++++++++++++++++----------------------
 1 file changed, 32 insertions(+), 22 deletions(-)

diff --git a/download.py b/download.py
index 0abd2fc..1aa9396 100644
--- a/download.py
+++ b/download.py
@@ -10,27 +10,41 @@
 import requests
 from io import BytesIO
 import sys
+import concurrent.futures
 
+
+
+
+file_paths_and_url=[]
 parser = argparse.ArgumentParser(description='')
 parser.add_argument('--dataset_path', required=False, default= './data/annotations.json', help='Path to annotations')
 args = parser.parse_args()
 
 dataset_dir = os.path.dirname(args.dataset_path)
 
-print('Note. If for any reason the connection is broken. Just call me again and I will start where I left.')
+def downloadImage(url,file_path):
+    if not os.path.isfile(file_path):
+        return
+
+    try:
+        response = requests.get(url_original)
+        img = Image.open(BytesIO(response.content))
+        if img._getexif():
+            img.save(file_path, exif=img.info["exif"])
+        else:
+            img.save(file_path)
+    except Exception as e:
+        print ("Exception: ",e," at ",url_original,file_path)
 
-# Load annotations
 with open(args.dataset_path, 'r') as f:
     annotations = json.loads(f.read())
 
     nr_images = len(annotations['images'])
     for i in range(nr_images):
-
         image = annotations['images'][i]
 
         file_name = image['file_name']
-        url_original = image['flickr_url']
-        url_resized = image['flickr_640_url']
+        url = image['flickr_url']
 
         file_path = os.path.join(dataset_dir, file_name)
 
@@ -39,20 +53,16 @@
         if not os.path.isdir(subdir):
             os.mkdir(subdir)
 
-        if not os.path.isfile(file_path):
-            # Load and Save Image
-            response = requests.get(url_original)
-            img = Image.open(BytesIO(response.content))
-            if img._getexif():
-                img.save(file_path, exif=img.info["exif"])
-            else:
-                img.save(file_path)
-
-        # Show loading bar
-        bar_size = 30
-        x = int(bar_size * i / nr_images)
-        sys.stdout.write("%s[%s%s] - %i/%i\r" % ('Loading: ', "=" * x, "." * (bar_size - x), i, nr_images))
-        sys.stdout.flush()
-        i+=1
-
-    sys.stdout.write('Finished\n')
+        file_paths_and_url.append((url,file_path))
+
+with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
+    future_to_url = {executor.submit(downloadImage, *entry): entry for entry in file_paths_and_url}
+    for future in concurrent.futures.as_completed(future_to_url):
+        try:
+            data = future.result()
+        except Exception as exc:
+            data = str(type(exc))
+
+    sys.stdout.write('Finished. The dataset is available under data :) \n')
+
+

From 94031c21be6c6a9db247bf8284a55c96c21cfcf9 Mon Sep 17 00:00:00 2001
From: seomis <ppsimoes@worten.pt>
Date: Sun, 22 Nov 2020 21:33:34 +0000
Subject: [PATCH 2/2] [WDT-XXX] Fix downloadImage URL argument, restore progress
 bar, and lower thread pool size to 20

---
 download.py | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/download.py b/download.py
index 1aa9396..76d5f78 100644
--- a/download.py
+++ b/download.py
@@ -13,21 +13,20 @@
 import concurrent.futures
 
 
-
-
 file_paths_and_url=[]
 parser = argparse.ArgumentParser(description='')
 parser.add_argument('--dataset_path', required=False, default= './data/annotations.json', help='Path to annotations')
 args = parser.parse_args()
 
 dataset_dir = os.path.dirname(args.dataset_path)
+nr_images=0
+index=1
 
-def downloadImage(url,file_path):
-    if not os.path.isfile(file_path):
-        return
+print('Note. If for any reason the connection is broken. Just call me again and I will start where I left.')
 
+def downloadImage(url,file_path):
     try:
-        response = requests.get(url_original)
+        response = requests.get(url)
         img = Image.open(BytesIO(response.content))
         if img._getexif():
             img.save(file_path, exif=img.info["exif"])
@@ -39,7 +38,7 @@ def downloadImage(url,file_path):
 with open(args.dataset_path, 'r') as f:
     annotations = json.loads(f.read())
 
-    nr_images = len(annotations['images'])
+    nr_images += len(annotations['images'])
     for i in range(nr_images):
         image = annotations['images'][i]
 
@@ -53,13 +52,21 @@ def downloadImage(url,file_path):
         if not os.path.isdir(subdir):
             os.mkdir(subdir)
 
+        if os.path.isfile(file_path):
+            index+=1
+
         file_paths_and_url.append((url,file_path))
 
-with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
+with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
     future_to_url = {executor.submit(downloadImage, *entry): entry for entry in file_paths_and_url}
     for future in concurrent.futures.as_completed(future_to_url):
         try:
             data = future.result()
+            bar_size = 30
+            x = int(bar_size * index / nr_images)
+            sys.stdout.write("%s[%s%s] - %i/%i\r" % ('Loading: ', "=" * x, "." * (bar_size - x), index, nr_images))
+            sys.stdout.flush()
+            index+=1
         except Exception as exc:
             data = str(type(exc))