import argparse
import json
import os
import posixpath
from urllib.parse import unquote, urlparse

from huggingface_hub import HfApi, hf_hub_download, snapshot_download
6+
7+
def human_size(num):
    """Format a byte count as a short human-readable string.

    The value is divided by 1024 until it drops below 1024, stepping through
    the units B, KB, MB, GB and TB. A value that is still >= 1024 after the
    TB step is reported in PB.
    """
    units = ("B", "KB", "MB", "GB", "TB")
    idx = 0
    while idx < len(units):
        if num < 1024.0:
            return f"{num:3.1f} {units[idx]}"
        num = num / 1024.0
        idx += 1
    # Fell off the end of the unit table: whatever remains is in petabytes.
    return f"{num:.1f} PB"
15+
16+
def fix_hf_url(hf_url):
    """Convert a HuggingFace 'blob' URL to a 'resolve' URL for direct download.

    Only the first "/blob/" segment is rewritten, and only when no
    "/resolve/" segment precedes it. This keeps a directory literally named
    "blob" inside the file path intact, and leaves URLs that are already in
    'resolve' form unchanged.

    Args:
        hf_url: URL string in either 'blob' or 'resolve' form.

    Returns:
        The URL with its link-kind segment set to "resolve".
    """
    blob_marker = "/blob/"
    blob_at = hf_url.find(blob_marker)
    if blob_at == -1:
        return hf_url

    resolve_at = hf_url.find("/resolve/")
    if resolve_at != -1 and resolve_at < blob_at:
        # Already a resolve URL; this "/blob/" belongs to the file path.
        return hf_url

    return hf_url[:blob_at] + "/resolve/" + hf_url[blob_at + len(blob_marker):]
20+
21+
def parse_hf_url(hf_url):
    """
    Parse a Hugging Face dataset file URL (supports 'blob' or 'resolve' forms).

    The URL path is expected to look like
    ``[.../datasets/]<org>/<repo>/<blob|resolve>/<revision>/<file path...>``.
    The link kind is checked on the split path segment rather than by string
    replacement, so a "blob" directory inside the file path is preserved.
    Percent-encoded segments (e.g. ``%20``) are decoded after splitting.

    Args:
        hf_url: URL of a file inside a HuggingFace dataset repository.

    Returns:
        (repo_id, revision, path_in_repo).

    Raises:
        ValueError: if the URL does not have the expected layout or does not
            point at a file (no path after the revision).
    """
    parts = urlparse(hf_url).path.strip("/").split("/")

    if "datasets" in parts:
        parts = parts[parts.index("datasets") + 1:]

    # Need org, repo, link kind, revision and at least one file path segment.
    if len(parts) < 5 or parts[2] not in ("blob", "resolve"):
        raise ValueError(f"URL does not look like a valid HuggingFace dataset file URL: {hf_url}")

    # Decode after splitting so an encoded "/" (%2F) stays inside its segment.
    org, name, _link_kind, revision, *file_parts = (unquote(p) for p in parts)

    return f"{org}/{name}", revision, "/".join(file_parts)
43+
44+
def get_json_repo_folder(path_in_repo):
    """
    Return the directory part of the JSON's repo path.

    An empty string is returned when the path has no directory component
    or when that component is just ".".
    """
    parent, _filename = os.path.split(path_in_repo)
    if not parent or parent == ".":
        return ""
    return parent
51+
52+
def extract_video_paths(osl_json):
    """
    Extract video paths from different OSL / SoccerNetPro JSON schemas.

    Supported formats:
    - videos[].path
    - data[].inputs[].path (where type == "video")

    The ``videos`` list takes precedence. ``data`` is consulted only when
    ``videos`` is absent, null, empty, or contains no entry with a ``path``.
    Null lists and non-dict entries are skipped. A leading "/" is stripped
    from every returned path.

    Args:
        osl_json: the decoded JSON document (a dict).

    Returns:
        List of video paths in document order (duplicates are kept).

    Raises:
        ValueError: if neither schema yields any video path.
    """
    # Legacy / simple format
    repo_paths = [
        video["path"].lstrip("/")
        for video in osl_json.get("videos") or []
        if isinstance(video, dict) and "path" in video
    ]

    # SoccerNetPro / OSL v2 format
    if not repo_paths:
        for item in osl_json.get("data") or []:
            if not isinstance(item, dict):
                continue
            for inp in item.get("inputs") or []:
                if not isinstance(inp, dict):
                    continue
                if inp.get("type") == "video" and "path" in inp:
                    repo_paths.append(inp["path"].lstrip("/"))

    if not repo_paths:
        raise ValueError("No video paths found in the provided OSL JSON.")

    return repo_paths
80+
81+
def main(osl_json_url, output_dir="downloaded_data", dry_run=False):
    """Download an OSL JSON from a HuggingFace dataset repo, then the videos it lists.

    The JSON file itself is always downloaded into ``output_dir`` (it is
    needed to learn which videos exist). Video paths that do not already
    start with the JSON's repo folder are treated as relative to that folder.

    Args:
        osl_json_url: 'blob' or 'resolve' URL of the OSL JSON file.
        output_dir: local directory that receives the downloaded files.
        dry_run: when True, only list the video files with their sizes and
            the total size; nothing but the JSON is downloaded.

    Raises:
        ValueError: propagated from URL parsing or when the JSON lists no videos.
    """
    api = HfApi()

    # Parse HuggingFace URL
    repo_id, revision, path_in_repo = parse_hf_url(osl_json_url)
    repo_json_folder = get_json_repo_folder(path_in_repo)

    print(f"⬇️ Downloading OSL JSON from {repo_id}@{revision}: {path_in_repo}")
    os.makedirs(output_dir, exist_ok=True)

    hf_json_path = hf_hub_download(
        repo_id=repo_id,
        repo_type="dataset",
        filename=path_in_repo,
        revision=revision,
        local_dir=output_dir,
        # NOTE(review): this flag is reported as deprecated in recent
        # huggingface_hub releases — confirm against the pinned version.
        local_dir_use_symlinks=False,
    )

    print(f" → Saved as {hf_json_path}")

    # Load JSON. Decode as UTF-8 explicitly instead of relying on the
    # platform default encoding.
    with open(hf_json_path, "r", encoding="utf-8") as f:
        osl = json.load(f)

    # Extract video paths (schema-aware)
    repo_paths = extract_video_paths(osl)
    print(f"Found {len(repo_paths)} video files to download.")

    def repo_full_path(rel_path):
        # Repo paths always use "/", so build them with posixpath rather than
        # os.path (which would insert "\\" on Windows and never match).
        if repo_json_folder and not rel_path.startswith(repo_json_folder + "/"):
            rel_path = posixpath.join(repo_json_folder, rel_path)
        # Collapse "./", "../" and doubled slashes so the result can match
        # the repo's own file names.
        return posixpath.normpath(rel_path)

    # Unique, repo-relative paths
    allow_patterns = sorted({repo_full_path(p) for p in repo_paths})

    if dry_run:
        print("Running in DRY-RUN mode (no files will be downloaded).")

        # Best effort: without metadata the listing still runs, but every
        # file is reported as "Not found".
        try:
            info_obj = api.repo_info(
                repo_id=repo_id,
                revision=revision,
                repo_type="dataset",
                files_metadata=True,
            )
            size_lookup = {sibling.rfilename: sibling.size for sibling in info_obj.siblings}
        except Exception as e:
            print(f"[ERROR] Could not fetch repo file metadata: {e}")
            size_lookup = {}

        total_size = 0
        missing_files = []

        for full_repo_path in allow_patterns:
            local_path = os.path.join(output_dir, full_repo_path)
            size = size_lookup.get(full_repo_path)

            if size is not None:
                size_str = human_size(size)
                total_size += size
            else:
                size_str = "Not found"
                missing_files.append(full_repo_path)

            print(f"[DRY RUN] Repo file : {full_repo_path} ({size_str})")
            print(f"[DRY RUN] Local path: {local_path}")

        print("-" * 48)
        print(f"Total estimated storage needed: {human_size(total_size)}")

        if missing_files:
            print(f"WARNING: {len(missing_files)} files not found in repo:")
            for missing in missing_files:
                print(f" - {missing}")

    else:
        print(f"Downloading {len(allow_patterns)} files using snapshot_download...")
        snapshot_download(
            repo_id=repo_id,
            repo_type="dataset",
            revision=revision,
            local_dir=output_dir,
            allow_patterns=allow_patterns,
            max_workers=8,
        )
        print(f" → All requested files downloaded to: {output_dir}")
170+
171+
if __name__ == "__main__":
    # Command-line entry point: read the three options and hand them to main().
    cli = argparse.ArgumentParser(
        description="Download videos referenced in an OSL JSON from HuggingFace."
    )
    cli.add_argument("--url", required=True, help="URL of the OSL JSON file on HuggingFace")
    cli.add_argument("--output-dir", default="downloaded_data", help="Directory to store downloaded files")
    cli.add_argument("--dry-run", action="store_true", help="List files to download without downloading them")

    options = cli.parse_args()
    main(options.url, options.output_dir, dry_run=options.dry_run)
0 commit comments