-
-
Notifications
You must be signed in to change notification settings - Fork 326
fix: replace manual anomalies with a hampel filter #1997
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
This file was deleted.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,129 +1,90 @@ | ||
| import type { ChartTimeGranularity, EvolutionData } from '~/types/chart' | ||
| import { DOWNLOAD_ANOMALIES } from './download-anomalies.data' | ||
| import type { EvolutionData } from '~/types/chart' | ||
|
|
||
| export type DownloadAnomalyBound = { | ||
| date: string // YYYY-MM-DD | ||
| weeklyDownloads: number | ||
| } | ||
| /** | ||
| * Hampel filter for automatic anomaly detection and correction. | ||
| * | ||
| * For each data point, computes the median and Median Absolute Deviation (MAD) | ||
| * of a surrounding window. Points deviating more than `threshold` MADs from | ||
| * the local median are flagged as anomalies and replaced with the median. | ||
| * | ||
| * This approach is unbiased — it applies the same statistical test to every | ||
| * package equally, with no manual curation. | ||
| */ | ||
|
|
||
| export type DownloadAnomaly = { | ||
| packageName: string | ||
| start: DownloadAnomalyBound | ||
| end: DownloadAnomalyBound | ||
| } | ||
| const DEFAULT_HALF_WINDOW = 3 | ||
| const DEFAULT_THRESHOLD = 3 | ||
|
|
||
| function getDateString(point: Record<string, any>, granularity: ChartTimeGranularity): string { | ||
| switch (granularity) { | ||
| case 'daily': | ||
| return point.day | ||
| case 'weekly': | ||
| return point.weekStart | ||
| case 'monthly': | ||
| return `${point.month}-01` | ||
| case 'yearly': | ||
| return `${point.year}-01-01` | ||
| } | ||
| function median(values: number[]): number { | ||
| const sorted = [...values].sort((a, b) => a - b) | ||
| const mid = Math.floor(sorted.length / 2) | ||
| return sorted.length % 2 !== 0 ? sorted[mid]! : (sorted[mid - 1]! + sorted[mid]!) / 2 | ||
| } | ||
|
|
||
| /** | ||
| * For daily the point date falls strictly between the anomaly bounds. | ||
| * For weekly the point date is the week start, and the full 7-day range is | ||
| * checked so any overlapping week is affected. | ||
| * For monthly/yearly the anomaly bounds are truncated to the same resolution | ||
| * so that any period overlapping the anomaly is caught (inclusive). | ||
| */ | ||
| function isDateAffected( | ||
| date: string, | ||
| anomaly: DownloadAnomaly, | ||
| granularity: ChartTimeGranularity, | ||
| ): boolean { | ||
| switch (granularity) { | ||
| case 'daily': | ||
| return date > anomaly.start.date && date < anomaly.end.date | ||
| case 'weekly': { | ||
| const startWeek = date | ||
| const weekStartDate = new Date(`${date}T00:00:00Z`) | ||
| const weekEndDate = new Date(weekStartDate) | ||
| weekEndDate.setUTCDate(weekEndDate.getUTCDate() + 6) | ||
| const endWeek = weekEndDate.toISOString().slice(0, 10) | ||
| return startWeek <= anomaly.end.date && endWeek >= anomaly.start.date | ||
| } | ||
| case 'monthly': { | ||
| const startMonth = anomaly.start.date.slice(0, 7) + '-01' | ||
| const endMonth = anomaly.end.date.slice(0, 7) + '-01' | ||
| return date >= startMonth && date <= endMonth | ||
| } | ||
| case 'yearly': { | ||
| const startYear = anomaly.start.date.slice(0, 4) + '-01-01' | ||
| const endYear = anomaly.end.date.slice(0, 4) + '-01-01' | ||
| return date >= startYear && date <= endYear | ||
| } | ||
| } | ||
| function mad(values: number[], med: number): number { | ||
| const deviations = values.map(v => Math.abs(v - med)) | ||
| return median(deviations) | ||
| } | ||
|
|
||
| function scaleWeeklyValue(weeklyValue: number, granularity: ChartTimeGranularity): number { | ||
| switch (granularity) { | ||
| case 'daily': | ||
| return Math.round(weeklyValue / 7) | ||
| case 'weekly': | ||
| return weeklyValue | ||
| case 'monthly': | ||
| return Math.round((weeklyValue / 7) * 30) | ||
| case 'yearly': | ||
| return Math.round((weeklyValue / 7) * 365) | ||
| } | ||
| } | ||
| export function applyHampelCorrection( | ||
| data: EvolutionData, | ||
| opts?: { halfWindow?: number; threshold?: number }, | ||
| ): EvolutionData { | ||
| // halfWindow controls how many neighbors on each side to consider. | ||
| // A window of 3 means we look at 7 points total (3 left + current + 3 right). | ||
| const halfWindow = opts?.halfWindow ?? DEFAULT_HALF_WINDOW | ||
|
|
||
| export function getAnomaliesForPackages( | ||
| packageNames: string[], | ||
| ): { packageName: string; start: string; end: string }[] { | ||
| return DOWNLOAD_ANOMALIES.filter(a => packageNames.includes(a.packageName)).map(a => ({ | ||
| packageName: a.packageName, | ||
| start: a.start.date, | ||
| end: a.end.date, | ||
| })) | ||
| } | ||
| // threshold controls sensitivity. A value of 3 means a point must deviate | ||
| // more than 3 scaled MADs from the local median to be flagged. | ||
| // Higher = less sensitive, lower = more aggressive filtering. | ||
| const threshold = opts?.threshold ?? DEFAULT_THRESHOLD | ||
|
|
||
| export function applyBlocklistCorrection(opts: { | ||
| data: EvolutionData | ||
| packageName: string | ||
| granularity: ChartTimeGranularity | ||
| }): EvolutionData { | ||
| const { data, packageName, granularity } = opts | ||
| const anomalies = DOWNLOAD_ANOMALIES.filter(a => a.packageName === packageName) | ||
| if (!anomalies.length) return data | ||
| // Not enough data to form a full window — return as-is. | ||
| if (data.length < halfWindow * 2 + 1) return data | ||
|
|
||
| // Clone to avoid mutation | ||
| const values = (data as Array<{ value: number }>).map(d => d.value) | ||
| // Clone to avoid mutating the original data. | ||
| const result = (data as Array<Record<string, any>>).map(d => ({ ...d })) | ||
|
|
||
| for (const anomaly of anomalies) { | ||
| // Find indices of affected points | ||
| const affectedIndices: number[] = [] | ||
| for (let i = 0; i < result.length; i++) { | ||
| const date = getDateString(result[i]!, granularity) | ||
| if (isDateAffected(date, anomaly, granularity)) { | ||
| affectedIndices.push(i) | ||
| } | ||
| } | ||
| for (let i = 0; i < values.length; i++) { | ||
| // Build a sliding window around the current point, clamped to array bounds. | ||
| const start = Math.max(0, i - halfWindow) | ||
| const end = Math.min(values.length - 1, i + halfWindow) | ||
| const window = values.slice(start, end + 1) | ||
|
|
||
| if (!affectedIndices.length) continue | ||
| // The median is robust to outliers — unlike the mean, a single spike | ||
| // won't pull it away from the true central tendency. | ||
| const windowMedian = median(window) | ||
|
|
||
| const firstAffected = affectedIndices[0]! | ||
| const lastAffected = affectedIndices[affectedIndices.length - 1]! | ||
| // MAD (Median Absolute Deviation) measures spread without being | ||
| // influenced by the outliers we're trying to detect. | ||
| const windowMad = mad(window, windowMedian) | ||
|
|
||
| // Use neighbors when available, fall back to scaled weeklyDownloads | ||
| const scaledStart = scaleWeeklyValue(anomaly.start.weeklyDownloads, granularity) | ||
| const scaledEnd = scaleWeeklyValue(anomaly.end.weeklyDownloads, granularity) | ||
| // How far this point is from the local median. | ||
| const deviation = Math.abs(values[i]! - windowMedian) | ||
|
|
||
| const startVal = firstAffected > 0 ? result[firstAffected - 1]!.value : scaledStart | ||
| const endVal = lastAffected < result.length - 1 ? result[lastAffected + 1]!.value : scaledEnd | ||
| // MAD of 0 means most values in the window are identical. | ||
| // If this point differs from the median at all, it's an outlier. | ||
| if (windowMad === 0) { | ||
| if (deviation > 0) { | ||
| result[i]!.value = Math.round(windowMedian) | ||
| result[i]!.hasAnomaly = true | ||
| } | ||
| continue | ||
|
Comment on lines +65 to +72
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The zero-MAD branch will erase real low-volume traffic. When |
||
| } | ||
|
|
||
| const count = affectedIndices.length | ||
| for (let i = 0; i < count; i++) { | ||
| const t = (i + 1) / (count + 1) | ||
| result[affectedIndices[i]!]!.value = Math.round(startVal + t * (endVal - startVal)) | ||
| result[affectedIndices[i]!]!.hasAnomaly = true | ||
| // Scale MAD to approximate standard deviation using the consistency | ||
| // constant 1.4826 (valid for normally distributed data). | ||
| // The resulting score is essentially "how many standard deviations | ||
| // away from the local median is this point?" | ||
| const score = deviation / (windowMad * 1.4826) | ||
|
|
||
| // If the score exceeds the threshold, replace with the median. | ||
| // This corrects the spike while preserving the surrounding trend. | ||
| if (score > threshold) { | ||
| result[i]!.value = Math.round(windowMedian) | ||
| result[i]!.hasAnomaly = true | ||
| } | ||
| } | ||
|
|
||
| return result as EvolutionData | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do not score boundary samples with truncated windows.
Line 33 says a `halfWindow` of `3` means 3 neighbours are considered on each side, but lines 50–52 clamp the first and last samples to shorter windows, and line 42 only validates the overall series length. That makes edge points easy false positives: `100, 100, 100, 100, 100, 100, 200` gets its last point flattened back to `100`, because there is no right-hand context and `windowMad` falls to `0`. Skip indices that cannot form a full symmetric window, or handle boundaries explicitly. Suggested fix
Also applies to: 41-42, 48-52