From de40f79f4273446dd6107b6d478f16efdbd2df79 Mon Sep 17 00:00:00 2001 From: seomis Date: Sun, 22 Nov 2020 20:27:03 +0000 Subject: [PATCH 1/2] [WDT-XXX] using thread pool executor to increase speed of downloadFlickr execution --- download.py | 54 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/download.py b/download.py index 0abd2fc..1aa9396 100644 --- a/download.py +++ b/download.py @@ -10,27 +10,41 @@ import requests from io import BytesIO import sys +import concurrent.futures + + + +file_paths_and_url=[] parser = argparse.ArgumentParser(description='') parser.add_argument('--dataset_path', required=False, default= './data/annotations.json', help='Path to annotations') args = parser.parse_args() dataset_dir = os.path.dirname(args.dataset_path) -print('Note. If for any reason the connection is broken. Just call me again and I will start where I left.') +def downloadImage(url,file_path): + if not os.path.isfile(file_path): + return + + try: + response = requests.get(url_original) + img = Image.open(BytesIO(response.content)) + if img._getexif(): + img.save(file_path, exif=img.info["exif"]) + else: + img.save(file_path) + except Exception as e: + print ("Exception: ",e," at ",url_original,file_path) -# Load annotations with open(args.dataset_path, 'r') as f: annotations = json.loads(f.read()) nr_images = len(annotations['images']) for i in range(nr_images): - image = annotations['images'][i] file_name = image['file_name'] - url_original = image['flickr_url'] - url_resized = image['flickr_640_url'] + url = image['flickr_url'] file_path = os.path.join(dataset_dir, file_name) @@ -39,20 +53,16 @@ if not os.path.isdir(subdir): os.mkdir(subdir) - if not os.path.isfile(file_path): - # Load and Save Image - response = requests.get(url_original) - img = Image.open(BytesIO(response.content)) - if img._getexif(): - img.save(file_path, exif=img.info["exif"]) - else: - img.save(file_path) - - # Show loading bar - bar_size = 30 - x = int(bar_size * i / nr_images) - sys.stdout.write("%s[%s%s] - %i/%i\r" % ('Loading: ', "=" * x, "." * (bar_size - x), i, nr_images)) - sys.stdout.flush() - i+=1 - - sys.stdout.write('Finished\n') + file_paths_and_url.append((url,file_path)) + +with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor: + future_to_url = {executor.submit(downloadImage, *entry): entry for entry in file_paths_and_url} + for future in concurrent.futures.as_completed(future_to_url): + try: + data = future.result() + except Exception as exc: + data = str(type(exc)) + + sys.stdout.write('Finished. The dataset is available under data :) \n') + + From 94031c21be6c6a9db247bf8284a55c96c21cfcf9 Mon Sep 17 00:00:00 2001 From: seomis Date: Sun, 22 Nov 2020 21:33:34 +0000 Subject: [PATCH 2/2] [WDT-XXX] using thread pool executor to increase speed of downloadImage execution --- download.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/download.py b/download.py index 1aa9396..76d5f78 100644 --- a/download.py +++ b/download.py @@ -13,21 +13,20 @@ import concurrent.futures - - file_paths_and_url=[] parser = argparse.ArgumentParser(description='') parser.add_argument('--dataset_path', required=False, default= './data/annotations.json', help='Path to annotations') args = parser.parse_args() dataset_dir = os.path.dirname(args.dataset_path) +nr_images=0 +index=1 -def downloadImage(url,file_path): - if not os.path.isfile(file_path): - return +print('Note. If for any reason the connection is broken. Just call me again and I will start where I left.') +def downloadImage(url,file_path): try: - response = requests.get(url_original) + response = requests.get(url) img = Image.open(BytesIO(response.content)) if img._getexif(): img.save(file_path, exif=img.info["exif"]) @@ -39,7 +38,7 @@ def downloadImage(url,file_path): with open(args.dataset_path, 'r') as f: annotations = json.loads(f.read()) - nr_images = len(annotations['images']) + nr_images += len(annotations['images']) for i in range(nr_images): image = annotations['images'][i] @@ -53,13 +52,21 @@ def downloadImage(url,file_path): if not os.path.isdir(subdir): os.mkdir(subdir) + if os.path.isfile(file_path): + index+=1 + file_paths_and_url.append((url,file_path)) -with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor: +with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor: future_to_url = {executor.submit(downloadImage, *entry): entry for entry in file_paths_and_url} for future in concurrent.futures.as_completed(future_to_url): try: data = future.result() + bar_size = 30 + x = int(bar_size * index / nr_images) + sys.stdout.write("%s[%s%s] - %i/%i\r" % ('Loading: ', "=" * x, "." * (bar_size - x), index, nr_images)) + sys.stdout.flush() + index+=1 except Exception as exc: data = str(type(exc))