Skip to content

Commit 68e4732

Browse files
Upload download_osl_hf.py
This script downloads an OSL-format annotation JSON file from a Hugging Face dataset repository, then downloads the video files that JSON references.
1 parent 3780347 commit 68e4732

1 file changed

Lines changed: 191 additions & 0 deletions

File tree

test_data/download_osl_hf.py

Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
import os
2+
import json
3+
import argparse
4+
from urllib.parse import urlparse
5+
from huggingface_hub import hf_hub_download, snapshot_download, HfApi
6+
7+
8+
def human_size(num):
    """Render a byte count as a short string such as '1.5 KB'.

    The value is divided by 1024 once per unit step through B, KB, MB, GB
    and TB; anything that is still not below 1024 after TB is reported in PB.
    """
    suffixes = ("B", "KB", "MB", "GB", "TB")
    step = 0
    # `not num < 1024.0` (rather than `num >= 1024.0`) keeps the exact
    # comparison the unit selection is based on.
    while step < len(suffixes) and not num < 1024.0:
        num /= 1024.0
        step += 1

    if step == len(suffixes):
        return f"{num:.1f} PB"

    unit = suffixes[step]
    return f"{num:3.1f} {unit}"
15+
16+
17+
def fix_hf_url(hf_url):
    """Convert a HuggingFace 'blob' URL to a 'resolve' URL for direct download.

    Only the route marker is rewritten: the first '/blob/' segment, and only
    when no '/resolve/' segment appears before it. Later '/blob/' segments
    belong to the file path inside the repo and must be left untouched,
    otherwise a directory literally named 'blob' would be renamed.

    Args:
        hf_url: URL of a file on the HuggingFace Hub ('blob' or 'resolve' form).

    Returns:
        The URL in 'resolve' form; returned unchanged if it has no '/blob/'
        marker or is already a 'resolve' URL.
    """
    blob_pos = hf_url.find("/blob/")
    if blob_pos == -1:
        return hf_url

    resolve_pos = hf_url.find("/resolve/")
    if resolve_pos != -1 and resolve_pos < blob_pos:
        # Already a 'resolve' URL; this '/blob/' is part of the file path.
        return hf_url

    return hf_url.replace("/blob/", "/resolve/", 1)
20+
21+
22+
def parse_hf_url(hf_url):
    """
    Parse a Hugging Face dataset file URL (supports 'blob' or 'resolve' forms).

    The URL path is expected to look like
    ``[datasets/]<namespace>/<name>/resolve/<revision>/<path/to/file>``.

    Args:
        hf_url: URL of a file in a HuggingFace repo.

    Returns:
        (repo_id, revision, path_in_repo), where repo_id is
        '<namespace>/<name>' and path_in_repo is the non-empty,
        '/'-separated file path inside the repo.

    Raises:
        ValueError: If the path does not match the layout above, including
            when no file path follows the revision.
    """
    url = fix_hf_url(hf_url)
    parsed = urlparse(url)
    parts = parsed.path.strip("/").split("/")

    # Drop everything up to and including the 'datasets' route prefix.
    if "datasets" in parts:
        datasets_idx = parts.index("datasets")
        parts = parts[datasets_idx + 1 :]

    # namespace, name, 'resolve', revision and at least one file segment.
    # Requiring 5 parts rejects URLs that stop at the revision, which would
    # otherwise yield an empty path_in_repo.
    if len(parts) < 5 or parts[2] != "resolve":
        raise ValueError(f"URL does not look like a valid HuggingFace dataset file URL: {url}")

    repo_id = f"{parts[0]}/{parts[1]}"
    revision = parts[3]
    path_in_repo = "/".join(parts[4:])

    return repo_id, revision, path_in_repo
43+
44+
45+
def get_json_repo_folder(path_in_repo):
    """
    Give the directory part of the JSON's repo path.

    An empty string is returned when the file has no directory part or when
    that part is just '.'.
    """
    parent = os.path.dirname(path_in_repo)
    if not parent:
        return ""
    if parent == ".":
        return ""
    return parent
51+
52+
53+
def extract_video_paths(osl_json):
    """
    Extract video paths from different OSL / SoccerNetPro JSON schemas.

    Supported formats:
    - videos[].path
    - data[].inputs[].path (where type == "video")

    The ``videos`` list is read first. The ``data`` list is consulted only
    when ``videos`` yields no path, so an empty or path-less ``videos`` key
    no longer hides videos declared under ``data``. Keys that are missing or
    set to null are treated as empty lists.

    Args:
        osl_json: The decoded OSL JSON document (a dict).

    Returns:
        List of video paths with leading '/' characters removed, in document
        order.

    Raises:
        ValueError: If neither schema yields any video path.
    """
    # Legacy / simple format
    repo_paths = [
        video["path"].lstrip("/")
        for video in osl_json.get("videos") or []
        if "path" in video
    ]

    # SoccerNetPro / OSL v2 format
    if not repo_paths:
        for item in osl_json.get("data") or []:
            for inp in item.get("inputs") or []:
                if inp.get("type") == "video" and "path" in inp:
                    repo_paths.append(inp["path"].lstrip("/"))

    if not repo_paths:
        raise ValueError("No video paths found in the provided OSL JSON.")

    return repo_paths
80+
81+
82+
def main(osl_json_url, output_dir="downloaded_data", dry_run=False):
    """Download an OSL JSON from a HuggingFace dataset repo, then its videos.

    The JSON is always downloaded into ``output_dir`` because it has to be
    read to find the video paths. Video paths that do not already start with
    the JSON's repo folder are resolved relative to that folder.

    Args:
        osl_json_url: 'blob' or 'resolve' URL of the OSL JSON file.
        output_dir: Local directory that receives the JSON and the videos.
        dry_run: When true, list each referenced file with its size and the
            total instead of downloading the videos.

    Raises:
        ValueError: If the URL cannot be parsed or the JSON lists no videos.
    """
    api = HfApi()

    # Parse HuggingFace URL
    repo_id, revision, path_in_repo = parse_hf_url(osl_json_url)
    repo_json_folder = get_json_repo_folder(path_in_repo)

    print(f"⬇️ Downloading OSL JSON from {repo_id}@{revision}: {path_in_repo}")
    os.makedirs(output_dir, exist_ok=True)

    hf_json_path = hf_hub_download(
        repo_id=repo_id,
        repo_type="dataset",
        filename=path_in_repo,
        revision=revision,
        local_dir=output_dir,
        local_dir_use_symlinks=False,
    )

    print(f" → Saved as {hf_json_path}")

    # Load JSON. The encoding is explicit so that decoding does not depend on
    # the platform's default locale encoding.
    with open(hf_json_path, "r", encoding="utf-8") as f:
        osl = json.load(f)

    # Extract video paths (schema-aware)
    repo_paths = extract_video_paths(osl)
    print(f"Found {len(repo_paths)} video files to download.")

    def repo_full_path(rel_path):
        """Resolve a video path from the JSON to a path relative to the repo root."""
        if repo_json_folder and not rel_path.startswith(repo_json_folder + "/"):
            # Repo paths always use '/', so join explicitly instead of using
            # os.path.join, which would insert '\\' on Windows.
            return f"{repo_json_folder}/{rel_path}"
        return rel_path

    # Unique, repo-relative paths
    allow_patterns = sorted({repo_full_path(p) for p in repo_paths})

    if dry_run:
        print("Running in DRY-RUN mode (no files will be downloaded).")

        # Best effort: without metadata every file is reported as "Not found".
        try:
            info_obj = api.repo_info(
                repo_id=repo_id,
                revision=revision,
                repo_type="dataset",
                files_metadata=True,
            )
            size_lookup = {f.rfilename: f.size for f in info_obj.siblings}
        except Exception as e:
            print(f"[ERROR] Could not fetch repo file metadata: {e}")
            size_lookup = {}

        total_size = 0
        missing_files = []

        for full_repo_path in allow_patterns:
            local_path = os.path.join(output_dir, full_repo_path)
            size = size_lookup.get(full_repo_path)

            if size is not None:
                size_str = human_size(size)
                total_size += size
            else:
                size_str = "Not found"
                missing_files.append(full_repo_path)

            print(f"[DRY RUN] Repo file : {full_repo_path} ({size_str})")
            print(f"[DRY RUN] Local path: {local_path}")

        print("-" * 48)
        print(f"Total estimated storage needed: {human_size(total_size)}")

        if missing_files:
            print(f"WARNING: {len(missing_files)} files not found in repo:")
            for f in missing_files:
                print(f" - {f}")

    else:
        print(f"Downloading {len(allow_patterns)} files using snapshot_download...")
        snapshot_download(
            repo_id=repo_id,
            repo_type="dataset",
            revision=revision,
            local_dir=output_dir,
            allow_patterns=allow_patterns,
            max_workers=8,
        )
        print(f" → All requested files downloaded to: {output_dir}")
170+
171+
172+
if __name__ == "__main__":
    # Command-line entry point: declare the options, then hand them to main().
    cli = argparse.ArgumentParser(
        description="Download videos referenced in an OSL JSON from HuggingFace."
    )

    # (flag, add_argument keyword arguments), in the order they are registered.
    option_specs = (
        ("--url", {"required": True, "help": "URL of the OSL JSON file on HuggingFace"}),
        ("--output-dir", {"default": "downloaded_data", "help": "Directory to store downloaded files"}),
        ("--dry-run", {"action": "store_true", "help": "List files to download without downloading them"}),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    options = cli.parse_args()
    main(options.url, options.output_dir, dry_run=options.dry_run)

0 commit comments

Comments
 (0)