Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions packages/data-analytics-demo/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,4 @@ venv/
# dbt build output, installed packages and run logs are generated artifacts.
dbt_project/target/
dbt_project/dbt_packages/
dbt_project/logs/
# Per-user dbt state file; presumably created in dbt_project/ because the
# Makefile points DBT_PROFILES_DIR at that directory — TODO confirm.
dbt_project/.user.yml
4 changes: 2 additions & 2 deletions packages/data-analytics-demo/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ data:
$(PYTHON) -m data_analytics_demo.data.generate

# Build every dbt model, then run the dbt tests against the built models.
# Each recipe line runs in its own shell, so the `cd` is repeated per line.
# DBT_PROFILES_DIR=. makes dbt read profiles.yml from dbt_project/ itself.
# If `dbt run` fails, make stops and `dbt test` is not attempted.
dbt:
	cd dbt_project && DBT_PROFILES_DIR=. dbt run
	cd dbt_project && DBT_PROFILES_DIR=. dbt test

ml:
@echo "[ml] TODO T-06/T-07: ML pipelines not yet implemented"
Expand Down
25 changes: 25 additions & 0 deletions packages/data-analytics-demo/dbt_project/dbt_project.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# dbt project definition for the data analytics demo.
name: data_analytics_demo
version: 0.1.0
config-version: 2

# Name of the connection profile dbt looks up in profiles.yml.
profile: data_analytics_demo

# Resource locations, relative to this file.
model-paths: ["models"]
analysis-paths: ["analyses"]
test-paths: ["tests"]
seed-paths: ["seeds"]
macro-paths: ["macros"]
snapshot-paths: ["snapshots"]

# Directories removed by `dbt clean`.
clean-targets:
  - target
  - dbt_packages

# Folder-level materialization defaults. The nesting below is significant:
# each `+materialized` applies only to the models under its own sub-folder.
# Individual models may still override these via an in-file `config(...)`.
models:
  data_analytics_demo:
    staging:
      +materialized: view
    intermediate:
      +materialized: view
    marts:
      +materialized: table
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
{{ config(materialized='view') }}

-- Customer-level feature base: one output row per row of stg_customers.
-- Three per-customer rollups are left-joined onto the customer record:
--   * lifetime event activity,
--   * invoice totals,
--   * the subscription with the latest start_date.
-- Customers missing from a rollup are kept; their counts and amounts fall
-- back to 0, while first/last event timestamps stay NULL.

with event_bounds as (
    -- Lifetime event counts plus the first and last event timestamps.
    select
        customer_id,
        count(*) as event_count_total,
        count(distinct event_type) as distinct_event_types,
        min(event_at) as first_event_at,
        max(event_at) as last_event_at
    from {{ ref('stg_events') }}
    group by customer_id
),

event_rollup as (
    -- Adds the inclusive day span between first and last event (hence + 1).
    select
        customer_id,
        event_count_total,
        distinct_event_types,
        first_event_at,
        last_event_at,
        date_diff('day', first_event_at, last_event_at) + 1 as active_days
    from event_bounds
),

invoice_rollup as (
    -- Paid revenue, number of failed invoices, and total invoice count.
    select
        customer_id,
        sum(case when status = 'paid' then amount_usd else 0 end) as lifetime_paid_usd,
        sum(case when status = 'failed' then 1 else 0 end) as failed_invoice_count,
        count(*) as invoice_count
    from {{ ref('stg_invoices') }}
    group by customer_id
),

subscriptions_ranked as (
    -- Rank 1 is the subscription with the most recent start_date.
    -- NOTE(review): ties on start_date are broken arbitrarily.
    select
        customer_id,
        plan_tier,
        status,
        row_number() over (
            partition by customer_id
            order by start_date desc
        ) as recency_rank
    from {{ ref('stg_subscriptions') }}
),

newest_subscription as (
    select
        customer_id,
        plan_tier as current_plan_tier,
        status as current_status
    from subscriptions_ranked
    where recency_rank = 1
)

select
    cust.customer_id,
    cust.email,
    cust.company,
    cust.signup_date,
    cust.region,
    cust.plan_tier_at_signup,
    -- Without any subscription row, fall back to the signup tier / 'unknown'.
    coalesce(sub.current_plan_tier, cust.plan_tier_at_signup) as current_plan_tier,
    coalesce(sub.current_status, 'unknown') as current_status,
    coalesce(ev.event_count_total, 0) as event_count_total,
    coalesce(ev.distinct_event_types, 0) as distinct_event_types,
    ev.first_event_at,
    ev.last_event_at,
    coalesce(ev.active_days, 0) as active_days,
    coalesce(inv.lifetime_paid_usd, 0) as lifetime_paid_usd,
    coalesce(inv.failed_invoice_count, 0) as failed_invoice_count,
    coalesce(inv.invoice_count, 0) as invoice_count
from {{ ref('stg_customers') }} as cust
left join event_rollup as ev
    on ev.customer_id = cust.customer_id
left join invoice_rollup as inv
    on inv.customer_id = cust.customer_id
left join newest_subscription as sub
    on sub.customer_id = cust.customer_id
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{{ config(materialized='view') }}

-- One row per (customer_id, event_type) pair, carrying the number of events
-- and the earliest / latest timestamp seen for that pair.
-- Intended as a shared input for the churn mart (support-ticket volume) and,
-- per the original author's note, the upsell mart (feature usage).

with events as (
    select
        customer_id,
        event_type,
        event_at
    from {{ ref('stg_events') }}
)

select
    customer_id,
    event_type,
    count(*) as event_count,
    min(event_at) as first_event_at,
    max(event_at) as last_event_at
from events
group by 1, 2
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
{{ config(materialized='table') }}

-- Feature table for churn prediction: one row per customer taken from
-- int_customer_features.
-- Label: is_churned is 1 when current_status = 'canceled', otherwise 0.
-- The main engineered signal compares the event rate over the trailing
-- 30 days with the customer's lifetime daily average.

with as_of as (
    -- Anchor date: calendar date of the most recent event in the data set.
    select cast(max(event_at) as date) as as_of_date
    from {{ ref('stg_events') }}
),

recent_events as (
    -- Events at or after midnight 30 days before the anchor date.
    select
        ev.customer_id,
        count(*) as events_last_30d
    from {{ ref('stg_events') }} as ev
    where ev.event_at >= (select as_of_date from as_of) - interval 30 day
    group by ev.customer_id
),

support_tickets as (
    select
        customer_id,
        sum(event_count) as support_ticket_count
    from {{ ref('int_event_aggregates') }}
    where event_type = 'support_ticket'
    group by customer_id
)

select
    feat.customer_id,
    feat.plan_tier_at_signup,
    feat.current_plan_tier,
    feat.region,
    feat.event_count_total,
    feat.distinct_event_types,
    feat.lifetime_paid_usd,
    feat.failed_invoice_count,
    feat.invoice_count,
    coalesce(recent.events_last_30d, 0) as events_last_30d,
    -- Lifetime events per day; NULL (not an error) when active_days is 0.
    cast(feat.event_count_total as double)
        / nullif(feat.active_days, 0) as lifetime_daily_avg_events,
    -- Recent daily rate divided by lifetime daily rate; values below 1.0
    -- mean activity is slowing. NULL when the customer has no activity.
    case
        when feat.active_days > 0 and feat.event_count_total > 0
            then (coalesce(recent.events_last_30d, 0) / 30.0)
                / (cast(feat.event_count_total as double) / feat.active_days)
    end as recent_to_lifetime_ratio,
    coalesce(tickets.support_ticket_count, 0) as support_ticket_count,
    case when feat.current_status = 'canceled' then 1 else 0 end as is_churned
from {{ ref('int_customer_features') }} as feat
left join recent_events as recent
    on recent.customer_id = feat.customer_id
left join support_tickets as tickets
    on tickets.customer_id = feat.customer_id
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
{{ config(materialized='table') }}

-- Retention grid: signup-month cohort x months elapsed since signup.
-- A customer counts as active at offset M when they have at least one event
-- in the calendar month M months after their signup month.
-- cohort_size is the number of customers who signed up in the cohort month.
-- Cells with no active customers produce no row (inner joins throughout).

with signups as (
    select
        customer_id,
        date_trunc('month', signup_date) as cohort_month
    from {{ ref('stg_customers') }}
),

active_months as (
    -- One row per customer per calendar month with any event.
    select
        customer_id,
        date_trunc('month', event_at) as active_month
    from {{ ref('stg_events') }}
    group by 1, 2
),

activity_by_offset as (
    -- Activity dated before the signup month is ignored.
    select
        s.cohort_month,
        s.customer_id,
        date_diff('month', s.cohort_month, a.active_month) as months_since_signup
    from signups as s
    inner join active_months as a
        on a.customer_id = s.customer_id
        and a.active_month >= s.cohort_month
),

sizes as (
    select
        cohort_month,
        count(distinct customer_id) as cohort_size
    from signups
    group by cohort_month
)

select
    act.cohort_month,
    sizes.cohort_size,
    act.months_since_signup,
    count(distinct act.customer_id) as active_customers,
    round(count(distinct act.customer_id) * 100.0 / sizes.cohort_size, 2) as retention_pct
from activity_by_offset as act
inner join sizes
    on sizes.cohort_month = act.cohort_month
group by
    act.cohort_month,
    sizes.cohort_size,
    act.months_since_signup
order by
    act.cohort_month,
    act.months_since_signup
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
{{ config(materialized='table') }}

-- RFM segmentation built from event recency, event frequency, and lifetime
-- paid amount. Each axis gets a quintile score in {1, 2, 3, 4, 5}; the
-- (R, F, M) triple is then mapped to one of five coarse segment labels:
-- champions, loyal, big_spenders, at_risk, regular.
--
-- Only customers with at least one event are scored (last_event_at not null).
--
-- Reference window: max(event_at) across all customers, so the data set
-- self-anchors. Every ntile() ordering is tie-broken on customer_id so that
-- customers sharing a value land in the same quintile on every run; without
-- the tie-breaker the bucket assignment of tied rows is arbitrary.

with reference_point as (
    select max(event_at)::date as as_of_date from {{ ref('stg_events') }}
),

rfm_raw as (
    select
        f.customer_id,
        -- Whole days between the customer's last event and the anchor date.
        date_diff(
            'day',
            cast(f.last_event_at as date),
            (select as_of_date from reference_point)
        ) as recency_days,
        f.event_count_total as frequency_events,
        f.lifetime_paid_usd as monetary_usd
    from {{ ref('int_customer_features') }} f
    where f.last_event_at is not null
),

rfm_scored as (
    select
        customer_id,
        recency_days,
        frequency_events,
        monetary_usd,
        -- Recency: lower is better, so reverse the quintile (5 = most recent).
        6 - ntile(5) over (order by recency_days, customer_id) as r_score,
        ntile(5) over (order by frequency_events, customer_id) as f_score,
        ntile(5) over (order by monetary_usd, customer_id) as m_score
    from rfm_raw
)

select
    customer_id,
    recency_days,
    frequency_events,
    monetary_usd,
    r_score,
    f_score,
    m_score,
    -- Branch order matters: the first matching rule wins.
    case
        when r_score >= 4 and f_score >= 4 and m_score >= 4 then 'champions'
        when r_score >= 4 and f_score >= 3 then 'loyal'
        when r_score >= 3 and m_score >= 4 then 'big_spenders'
        when r_score <= 2 and f_score <= 2 then 'at_risk'
        else 'regular'
    end as rfm_segment
from rfm_scored
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
version: 2

# Documentation and data tests for the mart models.
# Nesting is significant: models -> columns -> tests -> test arguments.

models:
  - name: rfm_segments
    description: RFM (Recency / Frequency / Monetary) segmentation, one row per active customer.
    columns:
      - name: customer_id
        description: Primary key; foreign key to stg_customers.
        tests:
          - not_null
          - unique
      - name: r_score
        description: Recency score in {1..5}; 5 = most recent.
        tests:
          - not_null
          # Enforce the documented quintile range, not just non-nullness.
          - accepted_values:
              arguments:
                values: [1, 2, 3, 4, 5]
                quote: false
      - name: f_score
        description: Frequency score in {1..5}.
        tests:
          - not_null
          - accepted_values:
              arguments:
                values: [1, 2, 3, 4, 5]
                quote: false
      - name: m_score
        description: Monetary score in {1..5}.
        tests:
          - not_null
          - accepted_values:
              arguments:
                values: [1, 2, 3, 4, 5]
                quote: false
      - name: rfm_segment
        description: Coarse 5-bucket label derived from the scores.
        tests:
          - not_null
          - accepted_values:
              arguments:
                values:
                  ["champions", "loyal", "big_spenders", "at_risk", "regular"]

  - name: churn_features
    description: One-row-per-customer feature table for churn prediction.
    columns:
      - name: customer_id
        tests:
          - not_null
          - unique
      - name: is_churned
        description: Binary label (1 if latest subscription canceled).
        tests:
          - not_null
          - accepted_values:
              arguments:
                values: [0, 1]

  - name: upsell_opportunities
    description: One-row-per-eligible-customer feature table for upsell propensity.
    columns:
      - name: customer_id
        tests:
          - not_null
          - unique
      - name: upgraded
        description: Binary label (1 if customer reached a tier higher than initial plan).
        tests:
          - not_null
          - accepted_values:
              arguments:
                values: [0, 1]
      - name: plan_tier_at_signup
        tests:
          - not_null
          - accepted_values:
              arguments:
                values: ["free", "pro"]

  - name: cohort_retention
    description: Monthly cohort × months-since-signup retention grid.
    columns:
      - name: cohort_month
        tests:
          - not_null
      - name: months_since_signup
        tests:
          - not_null
      - name: retention_pct
        description: 0–100 retention percentage for this (cohort, offset) cell.
        tests:
          - not_null
Loading
Loading