| """
|
| Demand Prediction System - Model Training Script
|
|
|
| This script trains multiple machine learning and time-series models to predict
|
| product demand (sales quantity) for an e-commerce platform.
|
|
|
| Features:
|
| - Data preprocessing and feature engineering
|
| - Date feature extraction (day, month, day_of_week, weekend)
|
| - Categorical encoding
|
| - Feature scaling
|
| - Multiple ML models (Linear Regression, Random Forest, XGBoost)
|
| - Time-series models (ARIMA, Prophet)
|
| - Model evaluation (MAE, RMSE, R2 Score)
|
| - Automatic best model selection
|
| - Model persistence using joblib
|
| - Visualization of results
|
| - Comparison between ML and time-series approaches
|
| """
|
|
|
| import pandas as pd
|
| import numpy as np
|
| import matplotlib.pyplot as plt
|
| import seaborn as sns
|
| from datetime import datetime
|
| import joblib
|
| import os
|
| import warnings
|
| warnings.filterwarnings('ignore')
|
|
|
|
|
| from sklearn.model_selection import train_test_split
|
| from sklearn.preprocessing import StandardScaler, LabelEncoder
|
| from sklearn.linear_model import LinearRegression
|
| from sklearn.ensemble import RandomForestRegressor
|
| from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
|
|
|
|
|
| try:
|
| import xgboost as xgb
|
| XGBOOST_AVAILABLE = True
|
| except ImportError:
|
| XGBOOST_AVAILABLE = False
|
| print("XGBoost not available. Install with: pip install xgboost")
|
|
|
|
|
| try:
|
| from statsmodels.tsa.arima.model import ARIMA
|
| from statsmodels.tsa.stattools import adfuller
|
| ARIMA_AVAILABLE = True
|
| except ImportError:
|
| ARIMA_AVAILABLE = False
|
| print("statsmodels not available. Install with: pip install statsmodels")
|
|
|
| try:
|
| from prophet import Prophet
|
| PROPHET_AVAILABLE = True
|
| except ImportError:
|
| PROPHET_AVAILABLE = False
|
| print("Prophet not available. Install with: pip install prophet")
|
|
|
|
|
| np.random.seed(42)
|
|
|
|
|
| DATA_PATH = 'data/sales.csv'
|
| MODEL_DIR = 'models'
|
| PLOTS_DIR = 'plots'
|
|
|
|
|
| os.makedirs(MODEL_DIR, exist_ok=True)
|
| os.makedirs(PLOTS_DIR, exist_ok=True)
|
|
|
|
|
def load_data(file_path):
    """Read the sales dataset from a CSV file.

    Args:
        file_path: Path to the CSV file.

    Returns:
        DataFrame: The loaded dataset.
    """
    print(f"Loading data from {file_path}...")
    frame = pd.read_csv(file_path)
    print(f"Data loaded successfully! Shape: {frame.shape}")
    return frame
|
|
|
|
|
def preprocess_data(df):
    """Prepare the raw sales data for modelling.

    Converts the 'date' column to datetimes, derives calendar features
    (day, month, day_of_week, weekend flag, year, quarter) and fills any
    missing values with per-column numeric medians.

    Args:
        df: Raw DataFrame.

    Returns:
        DataFrame: Preprocessed copy of the input.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("PREPROCESSING DATA")
    print(banner)

    data = df.copy()
    data['date'] = pd.to_datetime(data['date'])

    print("Extracting date features...")
    parts = data['date'].dt
    data['day'] = parts.day
    data['month'] = parts.month
    data['day_of_week'] = parts.dayofweek
    # Saturday (5) and Sunday (6) count as weekend.
    data['weekend'] = (data['day_of_week'] >= 5).astype(int)
    data['year'] = parts.year
    data['quarter'] = parts.quarter

    print("\nMissing values:")
    null_counts = data.isnull().sum()
    print(null_counts[null_counts > 0])

    if null_counts.sum() > 0:
        print("Filling missing values...")
        # Median-fill only numeric columns; non-numeric NaNs are left as-is.
        data = data.fillna(data.median(numeric_only=True))

    print("\nDataset Info:")
    print(f"Shape: {data.shape}")
    print(f"\nColumns: {data.columns.tolist()}")
    print(f"\nData types:\n{data.dtypes}")
    print(f"\nBasic statistics:\n{data.describe()}")

    return data
|
|
|
|
|
def feature_engineering(df):
    """Encode categorical variables and scale the model features.

    Label-encodes 'category' and 'product_id', drops the raw columns,
    then standardizes every feature column.

    Args:
        df: Preprocessed DataFrame.

    Returns:
        tuple: (X_features, y_target, feature_names, encoders, scaler)
    """
    banner = "=" * 60
    print("\n" + banner)
    print("FEATURE ENGINEERING")
    print(banner)

    feature_columns = ['product_id', 'price', 'discount', 'category',
                       'day', 'month', 'day_of_week', 'weekend', 'year', 'quarter']

    X = df[feature_columns].copy()
    y = df['sales_quantity'].copy()

    print("Encoding categorical variables...")

    category_encoder = LabelEncoder()
    product_encoder = LabelEncoder()
    X['category_encoded'] = category_encoder.fit_transform(X['category'])
    X['product_id_encoded'] = product_encoder.fit_transform(X['product_id'])

    # Raw categorical columns are replaced by their encoded counterparts.
    X = X.drop(['category', 'product_id'], axis=1)

    feature_names = X.columns.tolist()
    print(f"Features after encoding: {feature_names}")
    print(f"Number of features: {len(feature_names)}")

    print("\nScaling numerical features...")
    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=feature_names)

    # The scaler is returned both inside `encoders` and separately to keep
    # the existing call sites working.
    encoders = {
        'category': category_encoder,
        'product_id': product_encoder,
        'scaler': scaler,
    }

    return X_scaled, y, feature_names, encoders, scaler
|
|
|
|
|
def train_models(X_train, y_train, X_val, y_val):
    """Train the ML regressors and collect validation metrics.

    Trains Linear Regression, Random Forest and (when installed) XGBoost
    on the training split and evaluates each on the validation split.
    The original implementation repeated the fit/score/report code three
    times; that path is now shared by a single helper, and the unused
    local `models` dict was removed.

    Args:
        X_train: Training features.
        y_train: Training target.
        X_val: Validation features.
        y_val: Validation target.

    Returns:
        dict: Mapping of model name -> {'model', 'mae', 'rmse', 'r2',
        'predictions'}.
    """
    print("\n" + "="*60)
    print("TRAINING MODELS")
    print("="*60)

    results = {}

    def _fit_and_score(name, model):
        # Shared train -> predict -> score -> record path for every model.
        model.fit(X_train, y_train)
        predictions = model.predict(X_val)
        mae = mean_absolute_error(y_val, predictions)
        rmse = np.sqrt(mean_squared_error(y_val, predictions))
        r2 = r2_score(y_val, predictions)
        results[name] = {
            'model': model,
            'mae': mae,
            'rmse': rmse,
            'r2': r2,
            'predictions': predictions,
        }
        print(f"  MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.4f}")

    print("\n1. Training Linear Regression...")
    _fit_and_score('Linear Regression', LinearRegression())

    print("\n2. Training Random Forest Regressor...")
    _fit_and_score('Random Forest', RandomForestRegressor(
        n_estimators=100,
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42,
        n_jobs=-1,
    ))

    if XGBOOST_AVAILABLE:
        print("\n3. Training XGBoost Regressor...")
        _fit_and_score('XGBoost', xgb.XGBRegressor(
            n_estimators=100,
            max_depth=6,
            learning_rate=0.1,
            random_state=42,
            n_jobs=-1,
        ))
    else:
        print("\n3. XGBoost skipped (not available)")

    return results
|
|
|
|
|
def prepare_time_series_data(df):
    """Aggregate sales into a daily time series.

    Sums sales_quantity per calendar day and renames the columns to the
    ('ds', 'y') convention expected by the Prophet/ARIMA helpers.

    Fix over the original: the datetime conversion is no longer assigned
    back into the caller's DataFrame, so `df` is left unmodified.

    Args:
        df: DataFrame with 'date' and 'sales_quantity' columns.

    Returns:
        tuple: (ts_data, train_size) - the daily series and the number of
        leading rows to use for training (80% split).
    """
    print("\n" + "="*60)
    print("PREPARING TIME-SERIES DATA")
    print("="*60)

    # Convert dates into a local series and group by it, instead of
    # mutating df['date'] in place as before.
    dates = pd.to_datetime(df['date'])
    ts_data = df['sales_quantity'].groupby(dates).sum().reset_index()
    ts_data = ts_data.sort_values('date').reset_index(drop=True)
    ts_data.columns = ['ds', 'y']

    print(f"Time-series data shape: {ts_data.shape}")
    print(f"Date range: {ts_data['ds'].min()} to {ts_data['ds'].max()}")
    print(f"Total days: {len(ts_data)}")

    # 80/20 chronological split point for the time-series models.
    train_size = int(len(ts_data) * 0.8)

    return ts_data, train_size
|
|
|
|
|
def train_arima(ts_data, train_size):
    """Fit an ARIMA model to the aggregated daily series.

    Runs a small grid search over candidate (p, d, q) orders, keeps the
    fit with the lowest AIC, and scores its forecast on the hold-out
    portion of the series.

    Args:
        ts_data: Time-series DataFrame with 'ds' and 'y' columns.
        train_size: Number of leading samples used for training.

    Returns:
        dict: Result bundle (model, order, metrics, predictions, actuals,
        dates), or None when statsmodels is missing or fitting fails.
    """
    if not ARIMA_AVAILABLE:
        return None

    print("\n" + "="*60)
    print("TRAINING ARIMA MODEL")
    print("="*60)

    try:
        train_series = ts_data['y'].iloc[:train_size].values
        holdout = ts_data['y'].iloc[train_size:].values
        holdout_dates = ts_data['ds'].iloc[train_size:].values

        print(f"Training on {len(train_series)} samples")
        print(f"Validating on {len(holdout)} samples")

        best_aic = np.inf
        best_order = None
        best_fit = None

        candidate_orders = (
            (1, 1, 1),
            (2, 1, 2),
            (1, 1, 0),
            (0, 1, 1),
            (2, 1, 1),
            (1, 1, 2),
        )

        print("Trying different ARIMA orders...")
        for order in candidate_orders:
            try:
                fit = ARIMA(train_series, order=order).fit()
                aic = fit.aic
            except Exception as e:
                print(f"  Order {order}: Failed - {str(e)[:50]}")
                continue
            if aic < best_aic:
                best_aic, best_order, best_fit = aic, order, fit
                print(f"  Order {order}: AIC = {aic:.2f} (best so far)")
            else:
                print(f"  Order {order}: AIC = {aic:.2f}")

        if best_fit is None:
            print("Failed to fit ARIMA model with any order")
            return None

        print(f"\nBest ARIMA order: {best_order} (AIC: {best_aic:.2f})")

        # Forecast the hold-out horizon; demand can't be negative, so
        # clip the forecast at zero.
        forecast = np.maximum(best_fit.forecast(steps=len(holdout)), 0)

        mae = mean_absolute_error(holdout, forecast)
        rmse = np.sqrt(mean_squared_error(holdout, forecast))
        r2 = r2_score(holdout, forecast)

        print(f"  MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.4f}")

        return {
            'model': best_fit,
            'order': best_order,
            'mae': mae,
            'rmse': rmse,
            'r2': r2,
            'predictions': forecast,
            'actual': holdout,
            'dates': holdout_dates,
        }

    except Exception as e:
        print(f"Error training ARIMA: {str(e)}")
        return None
|
|
|
|
|
def train_prophet(ts_data, train_size):
    """Fit a Prophet model to the aggregated daily series.

    Trains on the leading `train_size` rows, forecasts the remaining
    horizon and reports hold-out metrics.

    Args:
        ts_data: Time-series DataFrame with 'ds' and 'y' columns.
        train_size: Number of leading samples used for training.

    Returns:
        dict: Result bundle (model, metrics, predictions, actuals, dates,
        full forecast frame), or None when Prophet is missing or fails.
    """
    if not PROPHET_AVAILABLE:
        return None

    print("\n" + "="*60)
    print("TRAINING PROPHET MODEL")
    print("="*60)

    try:
        train_part = ts_data.iloc[:train_size].copy()
        holdout_part = ts_data.iloc[train_size:].copy()

        print(f"Training on {len(train_part)} samples")
        print(f"Validating on {len(holdout_part)} samples")

        model = Prophet(
            daily_seasonality=False,
            weekly_seasonality=True,
            yearly_seasonality=True,
            seasonality_mode='multiplicative',
            changepoint_prior_scale=0.05,
        )

        print("Fitting Prophet model...")
        model.fit(train_part)

        # Extend the frame over the hold-out horizon and predict everything.
        # NOTE(review): slicing the forecast at `train_size` assumes the
        # series is contiguous daily data; with date gaps the generated
        # future rows would not line up with the actual hold-out dates —
        # confirm the upstream aggregation guarantees contiguity.
        future = model.make_future_dataframe(periods=len(holdout_part), freq='D')
        forecast = model.predict(future)

        # Keep only the hold-out slice; demand can't be negative.
        predicted = np.maximum(forecast.iloc[train_size:]['yhat'].values, 0)
        observed = holdout_part['y'].values

        mae = mean_absolute_error(observed, predicted)
        rmse = np.sqrt(mean_squared_error(observed, predicted))
        r2 = r2_score(observed, predicted)

        print(f"  MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.4f}")

        return {
            'model': model,
            'mae': mae,
            'rmse': rmse,
            'r2': r2,
            'predictions': predicted,
            'actual': observed,
            'dates': holdout_part['ds'].values,
            'full_forecast': forecast,
        }

    except Exception as e:
        print(f"Error training Prophet: {str(e)}")
        import traceback
        traceback.print_exc()
        return None
|
|
|
|
|
def select_best_model(results):
    """Pick the best-performing model by validation R2 (higher wins).

    Prints a comparison table of every trained model, then returns the
    entry with the highest R2 score.

    Args:
        results: Mapping of model name -> metrics dict.

    Returns:
        tuple: (best_model_name, best_model, best_metrics)
    """
    print("\n" + "="*60)
    print("MODEL COMPARISON")
    print("="*60)

    # One table row per trained model.
    rows = [
        {
            'Model': name,
            'MAE': info['mae'],
            'RMSE': info['rmse'],
            'R2 Score': info['r2'],
        }
        for name, info in results.items()
    ]

    comparison_df = pd.DataFrame(rows)
    print("\nModel Performance Comparison:")
    print(comparison_df.to_string(index=False))

    # Ties resolve to the first name in iteration order, as before.
    best_model_name = max(results.keys(), key=lambda name: results[name]['r2'])
    winner = results[best_model_name]
    best_metrics = {key: winner[key] for key in ('mae', 'rmse', 'r2')}

    print(f"\n{'='*60}")
    print(f"BEST MODEL: {best_model_name}")
    print(f"MAE: {best_metrics['mae']:.2f}")
    print(f"RMSE: {best_metrics['rmse']:.2f}")
    print(f"R2 Score: {best_metrics['r2']:.4f}")
    print(f"{'='*60}")

    return best_model_name, winner['model'], best_metrics
|
|
|
|
|
def visualize_results(df, results, best_model_name, feature_names):
    """
    Create visualizations: demand trends, feature importance, model comparison.

    Saves up to five PNG files under PLOTS_DIR: daily demand trend,
    monthly average demand, feature importance (tree models only),
    metric comparison bars, and actual-vs-predicted curves for any
    time-series models present in `results`.

    Args:
        df: Original DataFrame
        results: Model results dictionary
        best_model_name: Name of the best model
        feature_names: List of feature names

    NOTE(review): this function mutates the caller's `df` in place
    (converts 'date' and adds a 'month_name' column) — confirm callers
    do not rely on the original columns/dtypes.
    """
    print("\n" + "="*60)
    print("GENERATING VISUALIZATIONS")
    print("="*60)

    # Global plotting defaults applied to every figure below.
    sns.set_style("whitegrid")
    plt.rcParams['figure.figsize'] = (12, 6)

    # --- Plot 1: total sales per day over the whole period ---
    print("1. Plotting demand trends over time...")
    df['date'] = pd.to_datetime(df['date'])
    daily_demand = df.groupby('date')['sales_quantity'].sum().reset_index()

    plt.figure(figsize=(14, 6))
    plt.plot(daily_demand['date'], daily_demand['sales_quantity'], linewidth=1, alpha=0.7)
    plt.title('Total Daily Sales Quantity Over Time', fontsize=16, fontweight='bold')
    plt.xlabel('Date', fontsize=12)
    plt.ylabel('Total Sales Quantity', fontsize=12)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(f'{PLOTS_DIR}/demand_trends.png', dpi=300, bbox_inches='tight')
    print(f"  Saved: {PLOTS_DIR}/demand_trends.png")
    plt.close()

    # --- Plot 2: average sales per calendar month ---
    print("2. Plotting monthly average demand...")
    # NOTE(review): this column is recomputed on `monthly_avg` below from
    # `month_names` and this df-level version is never read — looks like
    # dead code.
    df['month_name'] = pd.to_datetime(df['date']).dt.strftime('%B')
    monthly_avg = df.groupby('month')['sales_quantity'].mean().reset_index()
    month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    # 'month' is 1-based, the lookup list is 0-based, hence x-1.
    monthly_avg['month_name'] = monthly_avg['month'].apply(lambda x: month_names[x-1])

    plt.figure(figsize=(12, 6))
    plt.bar(monthly_avg['month_name'], monthly_avg['sales_quantity'], color='steelblue', alpha=0.7)
    plt.title('Average Sales Quantity by Month', fontsize=16, fontweight='bold')
    plt.xlabel('Month', fontsize=12)
    plt.ylabel('Average Sales Quantity', fontsize=12)
    plt.xticks(rotation=45)
    plt.grid(True, alpha=0.3, axis='y')
    plt.tight_layout()
    plt.savefig(f'{PLOTS_DIR}/monthly_demand.png', dpi=300, bbox_inches='tight')
    print(f"  Saved: {PLOTS_DIR}/monthly_demand.png")
    plt.close()

    # --- Plot 3: feature importance (only for models that expose it) ---
    print("3. Plotting feature importance...")
    best_model = results[best_model_name]['model']

    if hasattr(best_model, 'feature_importances_'):
        importances = best_model.feature_importances_
        feature_importance_df = pd.DataFrame({
            'feature': feature_names,
            'importance': importances
        }).sort_values('importance', ascending=False)

        plt.figure(figsize=(10, 6))
        plt.barh(feature_importance_df['feature'], feature_importance_df['importance'], color='coral', alpha=0.7)
        plt.title(f'Feature Importance - {best_model_name}', fontsize=16, fontweight='bold')
        plt.xlabel('Importance', fontsize=12)
        plt.ylabel('Feature', fontsize=12)
        # Put the highest-importance feature at the top of the chart.
        plt.gca().invert_yaxis()
        plt.grid(True, alpha=0.3, axis='x')
        plt.tight_layout()
        plt.savefig(f'{PLOTS_DIR}/feature_importance.png', dpi=300, bbox_inches='tight')
        print(f"  Saved: {PLOTS_DIR}/feature_importance.png")
        plt.close()
    else:
        print("  Feature importance not available for this model type")

    # --- Plot 4: MAE / RMSE / R2 side-by-side for every model ---
    print("4. Plotting model comparison...")
    model_names = list(results.keys())
    mae_scores = [results[m]['mae'] for m in model_names]
    rmse_scores = [results[m]['rmse'] for m in model_names]
    r2_scores = [results[m]['r2'] for m in model_names]

    # Split names so ML and time-series models get distinct bar colors
    # (ml_models itself is not read again below).
    ml_models = [m for m in model_names if m not in ['ARIMA', 'Prophet']]
    ts_models = [m for m in model_names if m in ['ARIMA', 'Prophet']]

    fig, axes = plt.subplots(1, 3, figsize=(18, 5))

    colors = []
    for m in model_names:
        if m in ts_models:
            colors.append('coral' if m == 'ARIMA' else 'salmon')
        else:
            colors.append('skyblue')

    axes[0].bar(model_names, mae_scores, color=colors, alpha=0.7)
    axes[0].set_title('MAE Comparison (Lower is Better)', fontsize=14, fontweight='bold')
    axes[0].set_ylabel('MAE', fontsize=12)
    axes[0].tick_params(axis='x', rotation=45)
    axes[0].grid(True, alpha=0.3, axis='y')

    # Legend explaining the two bar-color families, drawn once on the
    # first subplot only.
    from matplotlib.patches import Patch
    legend_elements = [
        Patch(facecolor='skyblue', alpha=0.7, label='ML Models'),
        Patch(facecolor='coral', alpha=0.7, label='Time-Series Models')
    ]
    axes[0].legend(handles=legend_elements, loc='upper right')

    axes[1].bar(model_names, rmse_scores, color=colors, alpha=0.7)
    axes[1].set_title('RMSE Comparison (Lower is Better)', fontsize=14, fontweight='bold')
    axes[1].set_ylabel('RMSE', fontsize=12)
    axes[1].tick_params(axis='x', rotation=45)
    axes[1].grid(True, alpha=0.3, axis='y')

    axes[2].bar(model_names, r2_scores, color=colors, alpha=0.7)
    axes[2].set_title('R2 Score Comparison (Higher is Better)', fontsize=14, fontweight='bold')
    axes[2].set_ylabel('R2 Score', fontsize=12)
    axes[2].tick_params(axis='x', rotation=45)
    axes[2].grid(True, alpha=0.3, axis='y')

    plt.tight_layout()
    plt.savefig(f'{PLOTS_DIR}/model_comparison.png', dpi=300, bbox_inches='tight')
    print(f"  Saved: {PLOTS_DIR}/model_comparison.png")
    plt.close()

    # --- Plot 5: actual vs predicted for each time-series model ---
    if ts_models:
        print("5. Plotting time-series model predictions...")
        fig, axes = plt.subplots(len(ts_models), 1, figsize=(14, 6*len(ts_models)))
        # subplots() returns a bare Axes (not an array) for a single row;
        # normalize to a list so indexing below works either way.
        if len(ts_models) == 1:
            axes = [axes]

        for idx, model_name in enumerate(ts_models):
            # Only plot entries that carry the hold-out arrays.
            if model_name in results and 'dates' in results[model_name]:
                dates = pd.to_datetime(results[model_name]['dates'])
                actual = results[model_name]['actual']
                predictions = results[model_name]['predictions']

                axes[idx].plot(dates, actual, label='Actual', linewidth=2, alpha=0.7)
                axes[idx].plot(dates, predictions, label='Predicted', linewidth=2, alpha=0.7, linestyle='--')
                axes[idx].set_title(f'{model_name} - Actual vs Predicted', fontsize=14, fontweight='bold')
                axes[idx].set_xlabel('Date', fontsize=12)
                axes[idx].set_ylabel('Sales Quantity', fontsize=12)
                axes[idx].legend()
                axes[idx].grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig(f'{PLOTS_DIR}/timeseries_predictions.png', dpi=300, bbox_inches='tight')
        print(f"  Saved: {PLOTS_DIR}/timeseries_predictions.png")
        plt.close()

    print("  Visualization complete!")
|
|
|
|
|
def save_model(model, encoders, scaler, feature_names, best_model_name, best_metrics):
    """Persist the best model plus everything needed to reuse it.

    Writes three artifacts into MODEL_DIR: the fitted model, the
    preprocessing objects (encoders + scaler + feature order) and a JSON
    metadata file describing the chosen model.

    Args:
        model: Trained model.
        encoders: Dictionary of fitted encoders.
        scaler: Fitted scaler.
        feature_names: Ordered list of feature names.
        best_model_name: Name of the best model.
        best_metrics: Dictionary of validation metrics.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("SAVING MODEL")
    print(banner)

    # Artifact 1: the model itself.
    model_path = f'{MODEL_DIR}/best_model.joblib'
    joblib.dump(model, model_path)
    print(f"Model saved to: {model_path}")

    # Artifact 2: everything predict-time code needs to rebuild inputs.
    preprocessing_path = f'{MODEL_DIR}/preprocessing.joblib'
    joblib.dump(
        {
            'encoders': encoders,
            'scaler': scaler,
            'feature_names': feature_names,
        },
        preprocessing_path,
    )
    print(f"Preprocessing objects saved to: {preprocessing_path}")

    # Artifact 3: human-readable metadata about the winning model.
    import json
    metadata_path = f'{MODEL_DIR}/model_metadata.json'
    metadata = {
        'model_name': best_model_name,
        'metrics': best_metrics,
        'feature_names': feature_names,
        'saved_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    }
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f, indent=4)
    print(f"Model metadata saved to: {metadata_path}")
|
|
|
|
|
def main():
    """
    Main function to orchestrate the training pipeline.

    Pipeline: load -> preprocess -> feature engineering -> ML training
    -> time-series training -> model selection -> visualization ->
    persistence of the winning model plus metadata for every model.
    """
    print("\n" + "="*60)
    print("DEMAND PREDICTION SYSTEM - MODEL TRAINING")
    print("ML Models vs Time-Series Models Comparison")
    print("="*60)

    # Load the raw sales data from disk.
    df = load_data(DATA_PATH)

    # Date features, missing-value handling, basic profiling output.
    df_processed = preprocess_data(df)

    # Encoded + scaled feature matrix for the ML models.
    X, y, feature_names, encoders, scaler = feature_engineering(df_processed)

    print("\n" + "="*60)
    print("SPLITTING DATA FOR ML MODELS")
    print("="*60)
    # NOTE(review): a random split leaks future rows into training for
    # what is ultimately a forecasting task — a chronological split would
    # be stricter. Kept as-is.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    print(f"Training set: {X_train.shape[0]} samples")
    print(f"Validation set: {X_val.shape[0]} samples")

    print("\n" + "="*70)
    print("TRAINING MACHINE LEARNING MODELS")
    print("="*70)
    results = train_models(X_train, y_train, X_val, y_val)

    # Daily aggregate series shared by ARIMA and Prophet.
    ts_data, train_size = prepare_time_series_data(df_processed)

    print("\n" + "="*70)
    print("TRAINING TIME-SERIES MODELS")
    print("="*70)

    # Each trainer returns None on failure; only successful fits join
    # the comparison.
    if ARIMA_AVAILABLE:
        arima_results = train_arima(ts_data, train_size)
        if arima_results:
            results['ARIMA'] = arima_results
    else:
        print("\nARIMA skipped (statsmodels not available)")

    if PROPHET_AVAILABLE:
        prophet_results = train_prophet(ts_data, train_size)
        if prophet_results:
            results['Prophet'] = prophet_results
    else:
        print("\nProphet skipped (prophet not available)")

    # Winner is chosen by highest validation R2 across all model types.
    best_model_name, best_model, best_metrics = select_best_model(results)

    visualize_results(df_processed, results, best_model_name, feature_names)

    # ML winners go through save_model (model + preprocessing + metadata);
    # time-series winners are dumped directly, and the ML preprocessing
    # objects are still saved for predict.py.
    if best_model_name not in ['ARIMA', 'Prophet']:
        save_model(best_model, encoders, scaler, feature_names, best_model_name, best_metrics)
    else:

        print("\n" + "="*60)
        print("SAVING TIME-SERIES MODEL")
        print("="*60)
        ts_model_path = f'{MODEL_DIR}/best_timeseries_model.joblib'
        joblib.dump(best_model, ts_model_path)
        print(f"Time-series model saved to: {ts_model_path}")

        preprocessing_path = f'{MODEL_DIR}/preprocessing.joblib'
        preprocessing_data = {
            'encoders': encoders,
            'scaler': scaler,
            'feature_names': feature_names
        }
        joblib.dump(preprocessing_data, preprocessing_path)
        print(f"ML preprocessing objects saved to: {preprocessing_path}")

    # Record metrics for every trained model alongside the winner.
    import json
    all_models_metadata = {
        'best_model': best_model_name,
        'best_metrics': best_metrics,
        'all_models': {}
    }
    for model_name, model_results in results.items():
        all_models_metadata['all_models'][model_name] = {
            'mae': model_results['mae'],
            'rmse': model_results['rmse'],
            'r2': model_results['r2']
        }
    all_models_metadata['saved_at'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    metadata_path = f'{MODEL_DIR}/all_models_metadata.json'
    with open(metadata_path, 'w') as f:
        json.dump(all_models_metadata, f, indent=4)
    print(f"All models metadata saved to: {metadata_path}")

    print("\n" + "="*60)
    print("TRAINING COMPLETE!")
    print("="*60)
    print(f"\nBest model: {best_model_name}")
    print(f"Model type: {'Time-Series' if best_model_name in ['ARIMA', 'Prophet'] else 'Machine Learning'}")
    print(f"Model saved to: {MODEL_DIR}/")
    print(f"Visualizations saved to: {PLOTS_DIR}/")
    print("\nYou can now use predict.py to make predictions!")
|
|
|
|
|
| if __name__ == "__main__":
|
| main()
|
|
|