diff --git a/download.py b/download.py index 0abd2fc..76d5f78 100644 --- a/download.py +++ b/download.py @@ -10,27 +10,40 @@ import requests from io import BytesIO import sys +import concurrent.futures + +file_paths_and_url=[] parser = argparse.ArgumentParser(description='') parser.add_argument('--dataset_path', required=False, default= './data/annotations.json', help='Path to annotations') args = parser.parse_args() dataset_dir = os.path.dirname(args.dataset_path) +nr_images=0 +index=1 print('Note. If for any reason the connection is broken. Just call me again and I will start where I left.') -# Load annotations +def downloadImage(url,file_path): + try: + response = requests.get(url) + img = Image.open(BytesIO(response.content)) + if img._getexif(): + img.save(file_path, exif=img.info["exif"]) + else: + img.save(file_path) + except Exception as e: + print ("Exception: ",e," at ",url_original,file_path) + with open(args.dataset_path, 'r') as f: annotations = json.loads(f.read()) - nr_images = len(annotations['images']) + nr_images += len(annotations['images']) for i in range(nr_images): - image = annotations['images'][i] file_name = image['file_name'] - url_original = image['flickr_url'] - url_resized = image['flickr_640_url'] + url = image['flickr_url'] file_path = os.path.join(dataset_dir, file_name) @@ -39,20 +52,24 @@ if not os.path.isdir(subdir): os.mkdir(subdir) - if not os.path.isfile(file_path): - # Load and Save Image - response = requests.get(url_original) - img = Image.open(BytesIO(response.content)) - if img._getexif(): - img.save(file_path, exif=img.info["exif"]) - else: - img.save(file_path) - - # Show loading bar - bar_size = 30 - x = int(bar_size * i / nr_images) - sys.stdout.write("%s[%s%s] - %i/%i\r" % ('Loading: ', "=" * x, "." * (bar_size - x), i, nr_images)) - sys.stdout.flush() - i+=1 - - sys.stdout.write('Finished\n') + if os.path.isfile(file_path): + index+=1 + + file_paths_and_url.append((url,file_path)) + +with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor: + future_to_url = {executor.submit(downloadImage, *entry): entry for entry in file_paths_and_url} + for future in concurrent.futures.as_completed(future_to_url): + try: + data = future.result() + bar_size = 30 + x = int(bar_size * index / nr_images) + sys.stdout.write("%s[%s%s] - %i/%i\r" % ('Loading: ', "=" * x, "." * (bar_size - x), index, nr_images)) + sys.stdout.flush() + index+=1 + except Exception as exc: + data = str(type(exc)) + + sys.stdout.write('Finished. The dataset is available under data :) \n') + +