# Team-16-Machine-Learning-Project/phase2.py at main · derekogorry/Team-16-Machine-Learning-Project · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
# STEP 2: EXPLORATORY DATA ANALYSIS (EDA)
# EDA: looking at our data carefully before modeling.
# Note to my partners: this step does NOT modify the datasets — it only inspects
# them. (It does write chart PNGs and, when Kaggle is loaded, feature_mapping.pkl.)

# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns     # more visually appealing than matplotlib
import joblib

# ---- Load the raw datasets ----

# raw_datasets.pkl (written by step 1, per the original author's note) is read
# back with joblib; the loaded object is indexed by "ames" and "kaggle" below.
datasets = joblib.load("raw_datasets.pkl")
ames_df, kaggle_df = datasets["ames"], datasets["kaggle"]  # kaggle_df may be None

print("Datasets loaded successfully.")

# Resolve the Kaggle part of the summary up front so the f-string stays simple.
if kaggle_df is None:
    kaggle_shape = "not loaded"
else:
    kaggle_shape = kaggle_df.shape
print(f"Ames: {ames_df.shape}   |   Kaggle: {kaggle_shape}")

# PART 1: MISSING VALUES IN AMES
# Reports the percentage of missing cells per column and charts the worst 20.

print("\n" + "=" * 60)
print("PART 1: Missing Values (Ames dataset)")
print("=" * 60)

# isnull() returns True when a value is missing.
# .mean() converts True/False to 1/0 and averages them, giving the fraction missing;
# multiplying by 100 converts to a percentage.
missing_pct = (ames_df.isnull().mean() * 100).sort_values(ascending=False)

# Only keep columns that actually have missing values (> 0%)
missing_pct = missing_pct[missing_pct > 0]
print(f"\n{len(missing_pct)} columns have missing values:")

if missing_pct.empty:
    # Nothing to list or draw. An empty Series cannot yield a meaningful bar
    # chart (and may raise inside pandas' bar plotting), so skip the figure.
    print("No missing values found — skipping missing-values chart.")
else:
    print(missing_pct.round(1).to_string())  # .round(1) limits to 1 decimal place

    # Visualize missing values as a bar chart
    plt.figure(figsize=(10, 6))
    missing_pct.head(20).plot(kind="bar", color="#7F77DD")  # top 20 worst columns
    plt.title("Top 20 columns with missing values (Ames dataset)")
    plt.xlabel("Column name")
    plt.ylabel("% of values missing")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.savefig("eda_missing_values.png", dpi=120)
    plt.show()
    print("Chart saved: eda_missing_values.png")


# PART 2: TARGET VARIABLE — SALE PRICE DISTRIBUTION
# Draws SalePrice as a histogram twice, side by side: raw and log1p-transformed.
divider = "=" * 60
print("\n" + divider)
print("PART 2: Sale Price Distribution")
print(divider)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
sale_price = ames_df["SalePrice"]

# One row per panel: (axis, values, bar colour, title, x-axis label).
# np.log1p(x) = log(x + 1) — the +1 prevents issues if any value is 0.
panels = [
    (axes[0], sale_price, "#1D9E75",
     "Raw SalePrice — right-skewed", "Sale Price (USD)"),
    (axes[1], np.log1p(sale_price), "#378ADD",
     "log(SalePrice) — much more symmetric", "log(Sale Price)"),
]
for panel_ax, values, bar_color, panel_title, x_label in panels:
    values.plot(kind="hist", bins=50, ax=panel_ax, color=bar_color, edgecolor="white")
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel("Number of houses")

# Figure-level title, layout and export
fig.suptitle("Price distribution before and after log transform", fontsize=13)
fig.tight_layout()
fig.savefig("eda_price_distribution.png", dpi=120)
plt.show()
print("Chart saved: eda_price_distribution.png")
print("\nConclusion: We will log-transform SalePrice before training all models.")

# PART 3: FEATURE CORRELATIONS WITH SALE PRICE
# Range: -1 (opposite) to +1 (perfectly in sync). 0 = no relationship.
print("\n" + "=" * 60)
print("PART 3: Feature Correlations with SalePrice")
print("=" * 60)

# Only numeric columns
numeric_df = ames_df.select_dtypes(include=[np.number])

# .corr() computes pairwise correlations between all numeric columns
corr_matrix = numeric_df.corr()

# Pull out just correlations with SalePrice, sort high to low, drop SalePrice itself
correlations = (
    corr_matrix["SalePrice"]
    .drop("SalePrice")          # remove SalePrice's correlation with itself
    .sort_values(ascending=False)
)

print("\nTop 10 features most positively correlated with SalePrice:")
print(correlations.head(10).round(3))   # round to 3 dp for ease

print("\nTop 5 features most negatively correlated with SalePrice:")
# sort_values() places NaN entries (undefined correlations) at the end, so they
# would otherwise crowd out the real lowest values in tail(5). Drop them here only;
# `correlations` itself is left unchanged.
print(correlations.dropna().tail(5).round(3))

# Correlation heatmap for the top features (ranked by absolute correlation)
top_10_features = correlations.abs().nlargest(10).index.tolist()
top_10_features.append("SalePrice")  # add the target back in so it shows on the heatmap

plt.figure(figsize=(11, 9))
sns.heatmap(
    # Reuse the matrix computed above instead of calling .corr() again:
    # correlations are computed pair by pair, so slicing gives the same numbers.
    corr_matrix.loc[top_10_features, top_10_features],
    annot=True,        # write correlation number in each cell
    fmt=".2f",         # format numbers to 2 dp
    cmap="coolwarm",   # blue = negative correlation, red = positive
    center=0           # center the color scale at 0
)
plt.title("Correlation heatmap — top 10 features vs SalePrice")
plt.tight_layout()
plt.savefig("eda_correlation_heatmap.png", dpi=120)
plt.show()
print("Chart saved: eda_correlation_heatmap.png")

# PART 4: SCATTER PLOTS — KEY FEATURES VS PRICE
# Draws one scatter panel per feature in key_features against SalePrice.
print("\n" + "=" * 60)
print("PART 4: Scatter Plots — Key Features vs SalePrice")
print("=" * 60)

# These are 4 features usually seen as important for house prices
key_features = ["GrLivArea", "TotalBsmtSF", "GarageArea", "YearBuilt"]

fig, axes = plt.subplots(2, 2, figsize=(13, 10))
for ax, feature in zip(axes.flatten(), key_features):
    if feature not in ames_df.columns:
        # Say why the panel is absent instead of silently leaving a blank,
        # unlabeled set of axes in the saved figure.
        print(f"Warning: column '{feature}' not found in Ames dataset — panel skipped.")
        ax.set_visible(False)
        continue

    ax.scatter(
        ames_df[feature],
        ames_df["SalePrice"],
        alpha=0.3,   # transparency so we can see overlapping points
        s=10,        # point size
        color="#1D9E75"
    )
    ax.set_xlabel(feature)
    ax.set_ylabel("SalePrice ($)")
    ax.set_title(f"{feature} vs SalePrice")

plt.suptitle("Key feature relationships with sale price", fontsize=13)
plt.tight_layout()
plt.savefig("eda_scatter_plots.png", dpi=120)
plt.show()
print("Chart saved: eda_scatter_plots.png")

# PART 5: COMPARE AMES vs KAGGLE — SHARED FEATURES
# Since the two datasets can't be merged (different currencies/markets),
# we're looking for common features to apply our Ames-trained model
# for Kaggle generalization.

print("\n" + "=" * 60)
print("PART 5: Ames vs Kaggle — Feature Comparison")
print("=" * 60)

if kaggle_df is not None:
    print("\nKaggle dataset columns:")
    print(kaggle_df.columns.tolist())

    # Expected Kaggle columns per the original author's note (the actual list
    # is whatever was just printed above):
    # price, area, bedrooms, bathrooms, stories, mainroad, guestroom,
    # basement, hotwaterheating, airconditioning, parking, prefarea, furnishingstatus

    # Kaggle features judged to have rough equivalents in Ames:
    # Kaggle "area"      ≈ Ames "GrLivArea"    (above-ground living area sq ft)
    # Kaggle "bedrooms"  ≈ Ames "BedroomAbvGr" (bedrooms above ground)
    # Kaggle "bathrooms" ≈ Ames "FullBath"     (full bathrooms)
    # Kaggle "parking"   ≈ Ames "GarageCars"   (garage capacity)
    # Kaggle "stories" is only loosely related to Ames "MSSubClass", so it is
    # deliberately left OUT of the mapping below.

    # We define the mapping and save
    FEATURE_MAPPING = {
        # kaggle_column : ames_column
        "area":      "GrLivArea",
        "bedrooms":  "BedroomAbvGr",
        "bathrooms": "FullBath",
        "parking":   "GarageCars",
    }
    # *Note to my partners: We intentionally kept only numeric columns with clear parallels.
    # Categorical columns (mainroad, guestroom, etc.) are not mapped to any Ames column.

    print("\nFeature mapping (Kaggle → Ames equivalents):")
    for kaggle_col, ames_col in FEATURE_MAPPING.items():
        print(f"  Kaggle '{kaggle_col}' ≈ Ames '{ames_col}'")

    # Surface a mapping/column mismatch now rather than in a later step.
    # This only warns; the mapping that gets saved is unchanged.
    missing_kaggle = [col for col in FEATURE_MAPPING if col not in kaggle_df.columns]
    missing_ames = [col for col in FEATURE_MAPPING.values() if col not in ames_df.columns]
    if missing_kaggle or missing_ames:
        print(
            "Warning: mapped columns not found — "
            f"Kaggle: {missing_kaggle}, Ames: {missing_ames}"
        )

    print("\nWe will train on Ames's ~80 features, then apply the model to these")
    # Count comes from the mapping itself so the message cannot drift from it.
    print(f"{len(FEATURE_MAPPING)} shared features in the Kaggle dataset as a generalization test.")

    # Save the feature mapping so a later step (step 6, per the original note) can load it
    joblib.dump(FEATURE_MAPPING, "feature_mapping.pkl")
    print("Saved: feature_mapping.pkl")
else:
    print("Kaggle dataset not loaded — skipping comparison.")

print("\nStep 2 complete! Proceed to step3_preprocessing.py")