Skip to content

Commit 3ef4619

Browse files
committed
Require secret leaderboard pass for public scores
1 parent 87cddd1 commit 3ef4619

4 files changed

Lines changed: 123 additions & 12 deletions

File tree

src/libkernelbot/leaderboard_db.py

Lines changed: 45 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -884,6 +884,15 @@ def get_leaderboard_submissions(
884884
AND r.score IS NOT NULL
885885
AND r.passed
886886
AND s.user_id = %s
887+
AND EXISTS (
888+
SELECT 1
889+
FROM leaderboard.runs sr
890+
WHERE sr.submission_id = s.id
891+
AND sr.secret
892+
AND sr.runner = r.runner
893+
AND sr.mode = 'leaderboard'
894+
AND sr.passed
895+
)
887896
AND NOT EXISTS (
888897
SELECT 1
889898
FROM leaderboard.runs sr
@@ -913,6 +922,15 @@ def get_leaderboard_submissions(
913922
JOIN leaderboard.user_info ui ON s.user_id = ui.id
914923
WHERE l.name = %s AND r.runner = %s AND NOT r.secret
915924
AND r.score IS NOT NULL AND r.passed
925+
AND EXISTS (
926+
SELECT 1
927+
FROM leaderboard.runs sr
928+
WHERE sr.submission_id = s.id
929+
AND sr.secret
930+
AND sr.runner = r.runner
931+
AND sr.mode = 'leaderboard'
932+
AND sr.passed
933+
)
916934
AND NOT EXISTS (
917935
SELECT 1
918936
FROM leaderboard.runs sr
@@ -1264,6 +1282,15 @@ def get_user_submissions(
12641282
WHERE submission_id = ANY(%s)
12651283
AND NOT secret
12661284
AND passed
1285+
AND EXISTS (
1286+
SELECT 1
1287+
FROM leaderboard.runs sr
1288+
WHERE sr.submission_id = r.submission_id
1289+
AND sr.secret
1290+
AND sr.runner = r.runner
1291+
AND sr.mode = 'leaderboard'
1292+
AND sr.passed
1293+
)
12671294
AND NOT EXISTS (
12681295
SELECT 1
12691296
FROM leaderboard.runs sr
@@ -1410,6 +1437,15 @@ def get_leaderboard_submission_count(
14101437
AND r.score IS NOT NULL
14111438
AND r.passed
14121439
AND s.user_id = %s
1440+
AND EXISTS (
1441+
SELECT 1
1442+
FROM leaderboard.runs sr
1443+
WHERE sr.submission_id = s.id
1444+
AND sr.secret
1445+
AND sr.runner = r.runner
1446+
AND sr.mode = 'leaderboard'
1447+
AND sr.passed
1448+
)
14131449
AND NOT EXISTS (
14141450
SELECT 1
14151451
FROM leaderboard.runs sr
@@ -1431,6 +1467,15 @@ def get_leaderboard_submission_count(
14311467
AND NOT r.secret
14321468
AND r.score IS NOT NULL
14331469
AND r.passed
1470+
AND EXISTS (
1471+
SELECT 1
1472+
FROM leaderboard.runs sr
1473+
WHERE sr.submission_id = s.id
1474+
AND sr.secret
1475+
AND sr.runner = r.runner
1476+
AND sr.mode = 'leaderboard'
1477+
AND sr.passed
1478+
)
14341479
AND NOT EXISTS (
14351480
SELECT 1
14361481
FROM leaderboard.runs sr

src/libkernelbot/sql/get_hf_export_rows.sql

Lines changed: 25 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -8,14 +8,11 @@ WITH ranked AS (
88
s.code_id,
99
s.file_name,
1010
s.submission_time,
11-
COALESCE(
12-
sjs.status,
13-
CASE
14-
WHEN s.done AND r.score IS NOT NULL AND r.passed THEN 'succeeded'
15-
WHEN s.done THEN 'failed'
16-
ELSE s.status
17-
END
18-
) as status,
11+
CASE
12+
WHEN s.done AND r.score IS NOT NULL AND r.passed THEN 'succeeded'
13+
WHEN s.done THEN COALESCE(NULLIF(sjs.status, 'succeeded'), 'failed')
14+
ELSE COALESCE(sjs.status, s.status)
15+
END as status,
1916
r.score,
2017
r.passed,
2118
r.mode,
@@ -30,7 +27,26 @@ WITH ranked AS (
3027
LEFT JOIN leaderboard.user_info u ON s.user_id = u.id
3128
LEFT JOIN leaderboard.submission_job_status sjs ON s.id = sjs.submission_id
3229
LEFT JOIN leaderboard.runs r
33-
ON s.id = r.submission_id AND r.mode = 'leaderboard' AND NOT r.secret
30+
ON s.id = r.submission_id
31+
AND r.mode = 'leaderboard'
32+
AND NOT r.secret
33+
AND EXISTS (
34+
SELECT 1
35+
FROM leaderboard.runs sr
36+
WHERE sr.submission_id = s.id
37+
AND sr.secret
38+
AND sr.runner = r.runner
39+
AND sr.mode = 'leaderboard'
40+
AND sr.passed
41+
)
42+
AND NOT EXISTS (
43+
SELECT 1
44+
FROM leaderboard.runs sr
45+
WHERE sr.submission_id = s.id
46+
AND sr.secret
47+
AND sr.runner = r.runner
48+
AND sr.passed = FALSE
49+
)
3450
LEFT JOIN leaderboard.code_files c ON s.code_id = c.id
3551
WHERE s.leaderboard_id = ANY(%s)
3652
)

tests/test_hf_export.py

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -181,6 +181,8 @@ def test_query_filters_secret_runs(self):
181181
get_hf_export_rows(db, [763])
182182
sql = db.cursor.execute.call_args[0][0]
183183
assert "NOT r.secret" in sql
184+
assert "sr.mode = 'leaderboard'" in sql
185+
assert "sr.passed = FALSE" in sql
184186

185187
def test_query_partitions_by_runner(self):
186188
db = MagicMock()
@@ -196,7 +198,7 @@ def test_query_prefers_submission_job_status(self):
196198
get_hf_export_rows(db, [763])
197199
sql = db.cursor.execute.call_args[0][0]
198200
assert "submission_job_status" in sql
199-
assert "COALESCE(" in sql
201+
assert "COALESCE(NULLIF(sjs.status, 'succeeded'), 'failed')" in sql
200202
assert "sjs.status" in sql
201203

202204
def test_query_falls_back_to_derived_status_for_legacy_rows(self):
@@ -205,7 +207,10 @@ def test_query_falls_back_to_derived_status_for_legacy_rows(self):
205207
get_hf_export_rows(db, [763])
206208
sql = db.cursor.execute.call_args[0][0]
207209
assert "WHEN s.done AND r.score IS NOT NULL AND r.passed THEN 'succeeded'" in sql
208-
assert "WHEN s.done THEN 'failed'" in sql
210+
assert "r.score IS NOT NULL" in sql
211+
assert "r.passed" in sql
212+
assert "sr.mode = 'leaderboard'" in sql
213+
assert "WHEN s.done THEN COALESCE(NULLIF(sjs.status, 'succeeded'), 'failed')" in sql
209214

210215
def test_passes_leaderboard_ids_as_param(self):
211216
db = MagicMock()

tests/test_leaderboard_db.py

Lines changed: 46 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -287,9 +287,10 @@ def test_leaderboard_submission_count(database, submit_leaderboard):
287287
_create_submission_run(
288288
db, sub_id, mode="leaderboard", secret=False, runner="A100", score=1.5
289289
)
290+
_create_submission_run(db, sub_id, mode="leaderboard", secret=True, runner="A100")
290291
submission = db.get_submission_by_id(sub_id)
291292

292-
assert len(submission["runs"]) == 3
293+
assert len(submission["runs"]) == 4
293294

294295
db.mark_submission_done(sub_id)
295296
with database as db:
@@ -313,30 +314,35 @@ def test_leaderboard_submission_ranked(database, submit_leaderboard):
313314
"submit-leaderboard", "submission.py", 5, dangerous_code, submit_time, user_name="user"
314315
)
315316
_create_submission_run(db, sub_id, mode="leaderboard", runner="A100", score=5.5)
317+
_create_submission_run(db, sub_id, mode="leaderboard", secret=True, runner="A100")
316318
db.mark_submission_done(sub_id)
317319

318320
sub_id = db.create_submission(
319321
"submit-leaderboard", "submission.py", 5, dangerous_code, submit_time, user_name="user"
320322
)
321323
_create_submission_run(db, sub_id, mode="leaderboard", runner="A100", score=4.5)
324+
_create_submission_run(db, sub_id, mode="leaderboard", secret=True, runner="A100")
322325
db.mark_submission_done(sub_id)
323326

324327
sub_id = db.create_submission(
325328
"submit-leaderboard", "submission.py", 5, dangerous_code, submit_time, user_name="user"
326329
)
327330
_create_submission_run(db, sub_id, mode="leaderboard", runner="A100", score=5.0)
331+
_create_submission_run(db, sub_id, mode="leaderboard", secret=True, runner="A100")
328332
db.mark_submission_done(sub_id)
329333

330334
sub_id = db.create_submission(
331335
"submit-leaderboard", "submission.py", 6, dangerous_code, submit_time, user_name="user"
332336
)
333337
_create_submission_run(db, sub_id, mode="leaderboard", runner="A100", score=8.0)
338+
_create_submission_run(db, sub_id, mode="leaderboard", secret=True, runner="A100")
334339
db.mark_submission_done(sub_id)
335340

336341
sub_id = db.create_submission(
337342
"submit-leaderboard", "submission.py", 6, dangerous_code, submit_time, user_name="user"
338343
)
339344
_create_submission_run(db, sub_id, mode="leaderboard", runner="H100", score=2.0)
345+
_create_submission_run(db, sub_id, mode="leaderboard", secret=True, runner="H100")
340346
db.mark_submission_done(sub_id)
341347

342348
with database as db:
@@ -435,6 +441,7 @@ def test_failed_secret_benchmark_hides_public_leaderboard_score(database, submit
435441
)
436442
_create_submission_run(db, valid, mode="leaderboard", runner="A100", score=2.0)
437443
_create_submission_run(db, valid, mode="benchmark", secret=True, runner="A100")
444+
_create_submission_run(db, valid, mode="leaderboard", secret=True, runner="A100")
438445
db.mark_submission_done(valid)
439446

440447
with database as db:
@@ -444,6 +451,42 @@ def test_failed_secret_benchmark_hides_public_leaderboard_score(database, submit
444451
assert db.get_leaderboard_submission_count("submit-leaderboard", "A100", "5") == 0
445452

446453

454+
def test_missing_secret_leaderboard_run_hides_public_leaderboard_score(
455+
database, submit_leaderboard
456+
):
457+
submit_time = datetime.datetime.now(tz=datetime.timezone.utc)
458+
459+
with database as db:
460+
public_only = db.create_submission(
461+
"submit-leaderboard", "public_only.py", 5, "fast", submit_time, user_name="user5"
462+
)
463+
_create_submission_run(db, public_only, mode="leaderboard", runner="A100", score=1.0)
464+
db.mark_submission_done(public_only)
465+
466+
secret_test_only = db.create_submission(
467+
"submit-leaderboard", "secret_test_only.py", 6, "fast", submit_time, user_name="user6"
468+
)
469+
_create_submission_run(db, secret_test_only, mode="leaderboard", runner="A100", score=1.5)
470+
_create_submission_run(
471+
db, secret_test_only, mode="test", secret=True, runner="A100"
472+
)
473+
db.mark_submission_done(secret_test_only)
474+
475+
valid = db.create_submission(
476+
"submit-leaderboard", "valid.py", 7, "valid", submit_time, user_name="user7"
477+
)
478+
_create_submission_run(db, valid, mode="leaderboard", runner="A100", score=2.0)
479+
_create_submission_run(db, valid, mode="leaderboard", secret=True, runner="A100")
480+
db.mark_submission_done(valid)
481+
482+
with database as db:
483+
ranked = db.get_leaderboard_submissions("submit-leaderboard", "A100")
484+
assert [row["submission_id"] for row in ranked] == [valid]
485+
assert db.get_leaderboard_submission_count("submit-leaderboard", "A100") == 1
486+
assert db.get_leaderboard_submission_count("submit-leaderboard", "A100", "5") == 0
487+
assert db.get_leaderboard_submission_count("submit-leaderboard", "A100", "6") == 0
488+
489+
447490
def test_failed_secret_run_hides_user_submission_scores(database, submit_leaderboard):
448491
submit_time = datetime.datetime.now(tz=datetime.timezone.utc)
449492
failed_secret = dataclasses.replace(sample_run_result(), passed=False)
@@ -943,7 +986,9 @@ def test_get_user_submissions_with_multiple_runs(database, submit_leaderboard):
943986

944987
# Add multiple runs on different GPUs
945988
_create_submission_run(db, sub1, runner="A100", score=1.5, secret=False)
989+
_create_submission_run(db, sub1, runner="A100", secret=True)
946990
_create_submission_run(db, sub1, runner="H100", score=2.0, secret=False)
991+
_create_submission_run(db, sub1, runner="H100", secret=True)
947992
db.mark_submission_done(sub1)
948993

949994
# Get submissions

0 commit comments

Comments
 (0)