Skip to content

Latest commit

 

History

History
1035 lines (654 loc) · 29.9 KB

File metadata and controls

1035 lines (654 loc) · 29.9 KB
import pandas as pd
import numpy as np
df = pd.read_csv('D:/Swapnil/Swapnil/Work/Python_Projects/AD_Click_Agency/ad_click_dataset.csv')
# Dropping 'id' and 'full_name' as they are not relevant for prediction
df.drop(['id', 'full_name'], axis=1, inplace=True)
# Checking for duplicates
duplicate_count = df.duplicated().sum()
print(f"Number of duplicates: {duplicate_count}")
Number of duplicates: 4589
# Removing duplicates if any
df = df.drop_duplicates()
# Checking for missing values
missing_values = df.isnull().sum()
print(f"Missing values per column:\n{missing_values}")
Missing values per column:
age                 1845
gender              2366
device_type         1315
ad_position         1318
browsing_history    2249
time_of_day         1257
click                  0
dtype: int64
# Imputing numerical values (age) with median
df['age'] = df['age'].fillna(df['age'].median())

# Imputing categorical values with mode
df['gender'] = df['gender'].fillna(df['gender'].mode()[0])
df['device_type'] = df['device_type'].fillna(df['device_type'].mode()[0])
df['ad_position'] = df['ad_position'].fillna(df['ad_position'].mode()[0])
df['browsing_history'] = df['browsing_history'].fillna(df['browsing_history'].mode()[0])
df['time_of_day'] = df['time_of_day'].fillna(df['time_of_day'].mode()[0])
# Checking for missing values
missing_values = df.isnull().sum()
print(f"Missing values per column:\n{missing_values}")
Missing values per column:
age                 0
gender              0
device_type         0
ad_position         0
browsing_history    0
time_of_day         0
click               0
dtype: int64
import matplotlib.pyplot as plt
import seaborn as sns

# Age distribution
plt.figure(figsize=(8, 6))
sns.histplot(df['age'], bins=30, kde=True)
plt.title('Age Distribution')
plt.show()
# Gender vs Clicks
plt.figure(figsize=(8, 6))
sns.countplot(x='gender', hue='click', data=df)
plt.title('Gender vs Clicks')
plt.show()
# Device Type vs Clicks
plt.figure(figsize=(8, 6))
sns.countplot(x='device_type', hue='click', data=df)
plt.title('Device Type vs Clicks')
plt.show()
# Checking and ensuring correct data types
print(df.dtypes)

# Convert categorical columns to the correct data type if needed
df['gender'] = df['gender'].astype('category')
df['device_type'] = df['device_type'].astype('category')
df['ad_position'] = df['ad_position'].astype('category')
df['browsing_history'] = df['browsing_history'].astype('category')
df['time_of_day'] = df['time_of_day'].astype('category')
age                 float64
gender               object
device_type          object
ad_position          object
browsing_history     object
time_of_day          object
click                 int64
dtype: object
from sklearn.preprocessing import StandardScaler

# Scaling 'age' feature
scaler = StandardScaler()
df['age'] = scaler.fit_transform(df[['age']])
df = pd.get_dummies(df, drop_first=True)
from sklearn.model_selection import train_test_split

# Define features and target
X = df.drop('click', axis=1)
y = df['click']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Initialize and train logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)

# Evaluate the model
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))
Accuracy: 0.5548029556650246
              precision    recall  f1-score   support

           0       0.54      0.60      0.57       801
           1       0.57      0.51      0.54       823

    accuracy                           0.55      1624
   macro avg       0.56      0.56      0.55      1624
weighted avg       0.56      0.55      0.55      1624
from sklearn.ensemble import RandomForestClassifier

# Initialize and train Random Forest
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)

# Predict and evaluate
y_rf_pred = rf_model.predict(X_test)
print(f"Random Forest Accuracy: {accuracy_score(y_test, y_rf_pred)}")
Random Forest Accuracy: 0.5474137931034483
from xgboost import XGBClassifier

# Initialize and train XGBoost
xgb_model = XGBClassifier(use_label_encoder=False)
xgb_model.fit(X_train, y_train)

# Predict and evaluate
y_xgb_pred = xgb_model.predict(X_test)
print(f"XGBoost Accuracy: {accuracy_score(y_test, y_xgb_pred)}")
---------------------------------------------------------------------------

ModuleNotFoundError                       Traceback (most recent call last)

Cell In[20], line 1
----> 1 from xgboost import XGBClassifier
      3 # Initialize and train XGBoost
      4 xgb_model = XGBClassifier(use_label_encoder=False)


ModuleNotFoundError: No module named 'xgboost'
!pip install xgboost
Collecting xgboost
  Downloading xgboost-2.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Requirement already satisfied: numpy in c:\users\swapn\appdata\local\programs\python\python312\lib\site-packages (from xgboost) (2.1.1)
Requirement already satisfied: scipy in c:\users\swapn\appdata\local\programs\python\python312\lib\site-packages (from xgboost) (1.14.1)
Downloading xgboost-2.1.1-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.8/124.9 MB 5.6 MB/s eta 0:00:23
   - -------------------------------------- 3.9/124.9 MB 12.4 MB/s eta 0:00:10
   -- ------------------------------------- 7.1/124.9 MB 13.6 MB/s eta 0:00:09
   --- ------------------------------------ 10.0/124.9 MB 13.5 MB/s eta 0:00:09
   ---- ----------------------------------- 12.6/124.9 MB 13.4 MB/s eta 0:00:09
   ----- ---------------------------------- 15.7/124.9 MB 13.6 MB/s eta 0:00:09
   ------ --------------------------------- 21.5/124.9 MB 15.6 MB/s eta 0:00:07
   -------- ------------------------------- 26.2/124.9 MB 16.4 MB/s eta 0:00:07
   --------- ------------------------------ 30.9/124.9 MB 17.2 MB/s eta 0:00:06
   ----------- ---------------------------- 36.7/124.9 MB 18.5 MB/s eta 0:00:05
   -------------- ------------------------- 43.8/124.9 MB 19.9 MB/s eta 0:00:05
   ---------------- ----------------------- 50.3/124.9 MB 20.7 MB/s eta 0:00:04
   ------------------ --------------------- 57.4/124.9 MB 21.8 MB/s eta 0:00:04
   -------------------- ------------------- 62.9/124.9 MB 22.3 MB/s eta 0:00:03
   ---------------------- ----------------- 69.2/124.9 MB 22.6 MB/s eta 0:00:03
   ------------------------ --------------- 76.5/124.9 MB 23.4 MB/s eta 0:00:03
   -------------------------- ------------- 81.5/124.9 MB 23.4 MB/s eta 0:00:02
   ---------------------------- ----------- 87.8/124.9 MB 23.7 MB/s eta 0:00:02
   ------------------------------ --------- 94.1/124.9 MB 24.0 MB/s eta 0:00:02
   -------------------------------- ------ 103.3/124.9 MB 25.0 MB/s eta 0:00:01
   ---------------------------------- ---- 109.6/124.9 MB 25.3 MB/s eta 0:00:01
   ------------------------------------ -- 117.2/124.9 MB 25.7 MB/s eta 0:00:01
   --------------------------------------  124.8/124.9 MB 26.2 MB/s eta 0:00:01
   --------------------------------------  124.8/124.9 MB 26.2 MB/s eta 0:00:01
   --------------------------------------  124.8/124.9 MB 26.2 MB/s eta 0:00:01
   --------------------------------------- 124.9/124.9 MB 23.5 MB/s eta 0:00:00
Installing collected packages: xgboost
Successfully installed xgboost-2.1.1
from xgboost import XGBClassifier

# Initialize and train XGBoost
xgb_model = XGBClassifier(use_label_encoder=False)
xgb_model.fit(X_train, y_train)

# Predict and evaluate
y_xgb_pred = xgb_model.predict(X_test)
print(f"XGBoost Accuracy: {accuracy_score(y_test, y_xgb_pred)}")
XGBoost Accuracy: 0.6231527093596059


C:\Users\swapn\AppData\Local\Programs\Python\Python312\Lib\site-packages\xgboost\core.py:158: UserWarning: [15:43:19] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0015a694724fa8361-1\xgboost\xgboost-ci-windows\src\learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
from xgboost import XGBClassifier

# Initialize and train XGBoost without use_label_encoder
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)

# Predict and evaluate
y_xgb_pred = xgb_model.predict(X_test)
print(f"XGBoost Accuracy: {accuracy_score(y_test, y_xgb_pred)}")
XGBoost Accuracy: 0.6231527093596059
#OPTIMIZING

# Create interaction terms between 'device_type' and 'time_of_day'
df['device_time_interaction'] = (
    (df['device_type_Mobile'].astype(str) + "_Mobile_" + df['time_of_day'].astype(str)).replace('0_Mobile_', '') +
    (df['device_type_Tablet'].astype(str) + "_Tablet_" + df['time_of_day'].astype(str)).replace('0_Tablet_', '')
)

# Create interaction terms between 'ad_position' and 'browsing_history'
df['ad_browsing_interaction'] = (
    (df['ad_position_Side'].astype(str) + "_Side_" + df['browsing_history'].astype(str)).replace('0_Side_', '')
)
---------------------------------------------------------------------------

KeyError                                  Traceback (most recent call last)

File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\indexes\base.py:3805, in Index.get_loc(self, key)
   3804 try:
-> 3805     return self._engine.get_loc(casted_key)
   3806 except KeyError as err:


File index.pyx:167, in pandas._libs.index.IndexEngine.get_loc()


File index.pyx:196, in pandas._libs.index.IndexEngine.get_loc()


File pandas\\_libs\\hashtable_class_helper.pxi:7081, in pandas._libs.hashtable.PyObjectHashTable.get_item()


File pandas\\_libs\\hashtable_class_helper.pxi:7089, in pandas._libs.hashtable.PyObjectHashTable.get_item()


KeyError: 'time_of_day'


The above exception was the direct cause of the following exception:


KeyError                                  Traceback (most recent call last)

Cell In[27], line 5
      1 #OPTIMIZING
      2 
      3 # Create interaction terms between 'device_type' and 'time_of_day'
      4 df['device_time_interaction'] = (
----> 5     (df['device_type_Mobile'].astype(str) + "_Mobile_" + df['time_of_day'].astype(str)).replace('0_Mobile_', '') +
      6     (df['device_type_Tablet'].astype(str) + "_Tablet_" + df['time_of_day'].astype(str)).replace('0_Tablet_', '')
      7 )
      9 # Create interaction terms between 'ad_position' and 'browsing_history'
     10 df['ad_browsing_interaction'] = (
     11     (df['ad_position_Side'].astype(str) + "_Side_" + df['browsing_history'].astype(str)).replace('0_Side_', '')
     12 )


File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\frame.py:4102, in DataFrame.__getitem__(self, key)
   4100 if self.columns.nlevels > 1:
   4101     return self._getitem_multilevel(key)
-> 4102 indexer = self.columns.get_loc(key)
   4103 if is_integer(indexer):
   4104     indexer = [indexer]


File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\pandas\core\indexes\base.py:3812, in Index.get_loc(self, key)
   3807     if isinstance(casted_key, slice) or (
   3808         isinstance(casted_key, abc.Iterable)
   3809         and any(isinstance(x, slice) for x in casted_key)
   3810     ):
   3811         raise InvalidIndexError(key)
-> 3812     raise KeyError(key) from err
   3813 except TypeError:
   3814     # If we have a listlike key, _check_indexing_error will raise
   3815     #  InvalidIndexError. Otherwise we fall through and re-raise
   3816     #  the TypeError.
   3817     self._check_indexing_error(key)


KeyError: 'time_of_day'
print(df.columns)
Index(['age', 'click', 'gender_Male', 'gender_Non-Binary',
       'device_type_Mobile', 'device_type_Tablet', 'ad_position_Side',
       'ad_position_Top', 'browsing_history_Entertainment',
       'browsing_history_News', 'browsing_history_Shopping',
       'browsing_history_Social Media', 'time_of_day_Evening',
       'time_of_day_Morning', 'time_of_day_Night'],
      dtype='object')
# Combine device type and time of day columns to create interaction terms
df['device_time_interaction'] = (
    (df['device_type_Mobile'] * df['time_of_day_Morning']).astype(str) + "_Mobile_Morning_" +
    (df['device_type_Mobile'] * df['time_of_day_Evening']).astype(str) + "_Mobile_Evening_" +
    (df['device_type_Tablet'] * df['time_of_day_Morning']).astype(str) + "_Tablet_Morning_" +
    (df['device_type_Tablet'] * df['time_of_day_Night']).astype(str) + "_Tablet_Night_"
).replace('0', '')  # Remove 0 interactions
# Combine ad position and browsing history to create interaction terms
df['ad_browsing_interaction'] = (
    (df['ad_position_Side'] * df['browsing_history_Shopping']).astype(str) + "_Side_Shopping_" +
    (df['ad_position_Top'] * df['browsing_history_Social Media']).astype(str) + "_Top_Social_Media_" +
    (df['ad_position_Side'] * df['browsing_history_News']).astype(str) + "_Side_News_"
).replace('0', '')  # Remove 0 interactions
# Binning the 'age' column
df['age_group'] = pd.cut(df['age'], bins=[0, 20, 30, 40, 50, 100], labels=['Teen', 'Young Adult', 'Adult', 'Middle Aged', 'Senior'])
!pip install imbalanced-learn


from imblearn.over_sampling import SMOTE

# Define features and target
X = df.drop('click', axis=1)
y = df['click']

# Initialize SMOTE and resample
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# Check class distribution after SMOTE
print(pd.Series(y_res).value_counts())
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.12.3-py3-none-any.whl.metadata (8.3 kB)
Requirement already satisfied: numpy>=1.17.3 in c:\users\swapn\appdata\local\programs\python\python312\lib\site-packages (from imbalanced-learn) (2.1.1)
Requirement already satisfied: scipy>=1.5.0 in c:\users\swapn\appdata\local\programs\python\python312\lib\site-packages (from imbalanced-learn) (1.14.1)
Requirement already satisfied: scikit-learn>=1.0.2 in c:\users\swapn\appdata\local\programs\python\python312\lib\site-packages (from imbalanced-learn) (1.5.2)
Requirement already satisfied: joblib>=1.1.1 in c:\users\swapn\appdata\local\programs\python\python312\lib\site-packages (from imbalanced-learn) (1.4.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\swapn\appdata\local\programs\python\python312\lib\site-packages (from imbalanced-learn) (3.5.0)
Downloading imbalanced_learn-0.12.3-py3-none-any.whl (258 kB)
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.12.3



---------------------------------------------------------------------------

ValueError                                Traceback (most recent call last)

Cell In[35], line 12
     10 # Initialize SMOTE and resample
     11 smote = SMOTE(random_state=42)
---> 12 X_res, y_res = smote.fit_resample(X, y)
     14 # Check class distribution after SMOTE
     15 print(pd.Series(y_res).value_counts())


File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\imblearn\base.py:208, in BaseSampler.fit_resample(self, X, y)
    187 """Resample the dataset.
    188 
    189 Parameters
   (...)
    205     The corresponding label of `X_resampled`.
    206 """
    207 self._validate_params()
--> 208 return super().fit_resample(X, y)


File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\imblearn\base.py:106, in SamplerMixin.fit_resample(self, X, y)
    104 check_classification_targets(y)
    105 arrays_transformer = ArraysTransformer(X, y)
--> 106 X, y, binarize_y = self._check_X_y(X, y)
    108 self.sampling_strategy_ = check_sampling_strategy(
    109     self.sampling_strategy, y, self._sampling_type
    110 )
    112 output = self._fit_resample(X, y)


File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\imblearn\base.py:161, in BaseSampler._check_X_y(self, X, y, accept_sparse)
    159     accept_sparse = ["csr", "csc"]
    160 y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
--> 161 X, y = self._validate_data(X, y, reset=True, accept_sparse=accept_sparse)
    162 return X, y, binarize_y


File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py:650, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, cast_to_ndarray, **check_params)
    648         y = check_array(y, input_name="y", **check_y_params)
    649     else:
--> 650         X, y = check_X_y(X, y, **check_params)
    651     out = X, y
    653 if not no_val_X and check_params.get("ensure_2d", True):


File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\validation.py:1301, in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
   1296         estimator_name = _check_estimator_name(estimator)
   1297     raise ValueError(
   1298         f"{estimator_name} requires y to be passed, but the target y is None"
   1299     )
-> 1301 X = check_array(
   1302     X,
   1303     accept_sparse=accept_sparse,
   1304     accept_large_sparse=accept_large_sparse,
   1305     dtype=dtype,
   1306     order=order,
   1307     copy=copy,
   1308     force_writeable=force_writeable,
   1309     force_all_finite=force_all_finite,
   1310     ensure_2d=ensure_2d,
   1311     allow_nd=allow_nd,
   1312     ensure_min_samples=ensure_min_samples,
   1313     ensure_min_features=ensure_min_features,
   1314     estimator=estimator,
   1315     input_name="X",
   1316 )
   1318 y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
   1320 check_consistent_length(X, y)


File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\validation.py:1064, in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_writeable, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)
   1058     raise ValueError(
   1059         "Found array with dim %d. %s expected <= 2."
   1060         % (array.ndim, estimator_name)
   1061     )
   1063 if force_all_finite:
-> 1064     _assert_all_finite(
   1065         array,
   1066         input_name=input_name,
   1067         estimator_name=estimator_name,
   1068         allow_nan=force_all_finite == "allow-nan",
   1069     )
   1071 if copy:
   1072     if _is_numpy_namespace(xp):
   1073         # only make a copy if `array` and `array_orig` may share memory`


File ~\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\validation.py:108, in _assert_all_finite(X, allow_nan, msg_dtype, estimator_name, input_name)
    106 if not is_array_api and X.dtype == np.dtype("object") and not allow_nan:
    107     if _object_dtype_isnan(X).any():
--> 108         raise ValueError("Input contains NaN")
    110 # We need only consider float arrays, hence can early return for all else.
    111 if not xp.isdtype(X.dtype, ("real floating", "complex floating")):


ValueError: Input contains NaN
# Fill missing values in the 'age_group' column by re-binning the 'age' column
df['age_group'] = pd.cut(df['age'], bins=[0, 20, 30, 40, 50, 100], labels=['Teen', 'Young Adult', 'Adult', 'Middle Aged', 'Senior'])

# Verify missing values again
print(df.isnull().sum())

age                                  0
click                                0
gender_Male                          0
gender_Non-Binary                    0
device_type_Mobile                   0
device_type_Tablet                   0
ad_position_Side                     0
ad_position_Top                      0
browsing_history_Entertainment       0
browsing_history_News                0
browsing_history_Shopping            0
browsing_history_Social Media        0
time_of_day_Evening                  0
time_of_day_Morning                  0
time_of_day_Night                    0
device_time_interaction              0
ad_browsing_interaction              0
age_group                         3686
dtype: int64
# Drop the 'age_group' column
df.drop('age_group', axis=1, inplace=True)

# Verify that the column is dropped
print(df.columns)
Index(['age', 'click', 'gender_Male', 'gender_Non-Binary',
       'device_type_Mobile', 'device_type_Tablet', 'ad_position_Side',
       'ad_position_Top', 'browsing_history_Entertainment',
       'browsing_history_News', 'browsing_history_Shopping',
       'browsing_history_Social Media', 'time_of_day_Evening',
       'time_of_day_Morning', 'time_of_day_Night', 'device_time_interaction',
       'ad_browsing_interaction'],
      dtype='object')
# Drop the interaction columns that contain strings
df.drop(['device_time_interaction', 'ad_browsing_interaction'], axis=1, inplace=True)



from imblearn.over_sampling import SMOTE

# Define features and target
X = df.drop('click', axis=1)
y = df['click']

# Initialize SMOTE and resample
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# Check class distribution after SMOTE
print(pd.Series(y_res).value_counts())
click
1    2735
0    2735
Name: count, dtype: int64
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Initialize and train logistic regression model
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_res, y_res)

# Predict on test data
y_log_pred = log_model.predict(X_test)

# Evaluate the model
print(f"Logistic Regression Accuracy: {accuracy_score(y_test, y_log_pred)}")
print(classification_report(y_test, y_log_pred))
Logistic Regression Accuracy: 0.5634236453201971
              precision    recall  f1-score   support

           0       0.56      0.55      0.55       801
           1       0.57      0.57      0.57       823

    accuracy                           0.56      1624
   macro avg       0.56      0.56      0.56      1624
weighted avg       0.56      0.56      0.56      1624
from sklearn.ensemble import RandomForestClassifier

# Initialize and train Random Forest
rf_model = RandomForestClassifier()
rf_model.fit(X_res, y_res)

# Predict on test data
y_rf_pred = rf_model.predict(X_test)
print(f"Random Forest Accuracy: {accuracy_score(y_test, y_rf_pred)}")
Random Forest Accuracy: 0.8300492610837439
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV

# Define parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.001, 0.01, 0.1],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# Initialize XGBoost model
xgb_model = XGBClassifier()

# Randomized search
random_search = RandomizedSearchCV(xgb_model, param_grid, n_iter=10, scoring='accuracy', cv=5, random_state=42, n_jobs=-1)
random_search.fit(X_res, y_res)

# Best parameters
print(f"Best parameters: {random_search.best_params_}")

# Predict using the best model
y_xgb_pred = random_search.best_estimator_.predict(X_test)
print(f"Optimized XGBoost Accuracy: {accuracy_score(y_test, y_xgb_pred)}")
Best parameters: {'subsample': 1.0, 'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.1, 'colsample_bytree': 0.8}
Optimized XGBoost Accuracy: 0.7561576354679803
import matplotlib.pyplot as plt
import seaborn as sns

# Class distribution before SMOTE
plt.figure(figsize=(6,4))
sns.countplot(x=df['click'])
plt.title("Class Distribution Before SMOTE")
plt.show()

# Class distribution after SMOTE
plt.figure(figsize=(6,4))
sns.countplot(x=y_res)
plt.title("Class Distribution After SMOTE")
plt.show()
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Predictions for logistic regression
y_log_pred = log_model.predict(X_test)

# Calculate metrics for logistic regression
log_accuracy = accuracy_score(y_test, y_log_pred)
log_precision = precision_score(y_test, y_log_pred)
log_recall = recall_score(y_test, y_log_pred)

print(f"Logistic Regression - Accuracy: {log_accuracy}, Precision: {log_precision}, Recall: {log_recall}")



# Predictions for random forest
y_rf_pred = rf_model.predict(X_test)

# Calculate metrics for random forest
rf_accuracy = accuracy_score(y_test, y_rf_pred)
rf_precision = precision_score(y_test, y_rf_pred)
rf_recall = recall_score(y_test, y_rf_pred)

print(f"Random Forest - Accuracy: {rf_accuracy}, Precision: {rf_precision}, Recall: {rf_recall}")



# Predictions for XGBoost
y_xgb_pred = random_search.best_estimator_.predict(X_test)

# Calculate metrics for XGBoost
xgb_accuracy = accuracy_score(y_test, y_xgb_pred)
xgb_precision = precision_score(y_test, y_xgb_pred)
xgb_recall = recall_score(y_test, y_xgb_pred)

print(f"XGBoost - Accuracy: {xgb_accuracy}, Precision: {xgb_precision}, Recall: {xgb_recall}")




# Define the metrics for each model
model_names = ['Logistic Regression', 'Random Forest', 'XGBoost']
accuracies = [log_accuracy, rf_accuracy, xgb_accuracy]
precisions = [log_precision, rf_precision, xgb_precision]
recalls = [log_recall, rf_recall, xgb_recall]

# Plot accuracy comparison
plt.figure(figsize=(8, 6))
plt.bar(model_names, accuracies, color='skyblue')
plt.title('Model Accuracy Comparison')
plt.ylabel('Accuracy')
plt.show()

# Plot precision comparison
plt.figure(figsize=(8, 6))
plt.bar(model_names, precisions, color='lightgreen')
plt.title('Model Precision Comparison')
plt.ylabel('Precision')
plt.show()

# Plot recall comparison
plt.figure(figsize=(8, 6))
plt.bar(model_names, recalls, color='salmon')
plt.title('Model Recall Comparison')
plt.ylabel('Recall')
plt.show()


Logistic Regression - Accuracy: 0.5634236453201971, Precision: 0.5685096153846154, Recall: 0.574726609963548
Random Forest - Accuracy: 0.8300492610837439, Precision: 0.8132875143184422, Recall: 0.8626974483596598
XGBoost - Accuracy: 0.7561576354679803, Precision: 0.7217030114226376, Recall: 0.8444714459295262
from sklearn.metrics import plot_confusion_matrix

# Assuming you have the models already trained and tested
plt.figure(figsize=(8, 6))
plot_confusion_matrix(log_model, X_test, y_test)
plt.title('Confusion Matrix for Logistic Regression')
plt.show()

plt.figure(figsize=(8, 6))
plot_confusion_matrix(rf_model, X_test, y_test)
plt.title('Confusion Matrix for Random Forest')
plt.show()

plt.figure(figsize=(8, 6))
plot_confusion_matrix(random_search.best_estimator_, X_test, y_test)
plt.title('Confusion Matrix for XGBoost')
plt.show()
---------------------------------------------------------------------------

ImportError                               Traceback (most recent call last)

Cell In[47], line 1
----> 1 from sklearn.metrics import plot_confusion_matrix
      3 # Assuming you have the models already trained and tested
      4 plt.figure(figsize=(8, 6))


ImportError: cannot import name 'plot_confusion_matrix' from 'sklearn.metrics' (C:\Users\swapn\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\metrics\__init__.py)
from sklearn.metrics import ConfusionMatrixDisplay

# Logistic Regression
plt.figure(figsize=(8, 6))
ConfusionMatrixDisplay.from_estimator(log_model, X_test, y_test)
plt.title('Confusion Matrix for Logistic Regression')
plt.show()

# Random Forest
plt.figure(figsize=(8, 6))
ConfusionMatrixDisplay.from_estimator(rf_model, X_test, y_test)
plt.title('Confusion Matrix for Random Forest')
plt.show()

# XGBoost
plt.figure(figsize=(8, 6))
ConfusionMatrixDisplay.from_estimator(random_search.best_estimator_, X_test, y_test)
plt.title('Confusion Matrix for XGBoost')
plt.show()
<Figure size 800x600 with 0 Axes>









<Figure size 800x600 with 0 Axes>









<Figure size 800x600 with 0 Axes>
from sklearn.metrics import roc_curve, auc

# For Logistic Regression
log_fpr, log_tpr, _ = roc_curve(y_test, log_model.predict_proba(X_test)[:,1])
log_auc = auc(log_fpr, log_tpr)

# For Random Forest
rf_fpr, rf_tpr, _ = roc_curve(y_test, rf_model.predict_proba(X_test)[:,1])
rf_auc = auc(rf_fpr, rf_tpr)

# For XGBoost
xgb_fpr, xgb_tpr, _ = roc_curve(y_test, random_search.best_estimator_.predict_proba(X_test)[:,1])
xgb_auc = auc(xgb_fpr, xgb_tpr)

# Plot ROC curve
plt.figure(figsize=(8,6))
plt.plot(log_fpr, log_tpr, label=f'Logistic Regression (AUC = {log_auc:.2f})')
plt.plot(rf_fpr, rf_tpr, label=f'Random Forest (AUC = {rf_auc:.2f})')
plt.plot(xgb_fpr, xgb_tpr, label=f'XGBoost (AUC = {xgb_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--')  # Diagonal line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()