import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler

class DataHandler:
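    """Loads the raw fraud transactions CSV, engineers model features, and streams processed rows in batches."""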
    def __init__(self, data_path='data/Fraud.csv'):
        self.data_path = data_path
        self.scaler = MinMaxScaler()
        
    def haversine_distance(self, lat1, lon1, lat2, lon2):
        """Calculate haversine distance between two points"""
        R = 6371  # Earth radius in km
        
        lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
        dlat = lat2 - lat1
        dlon = lon2 - lon1
        
        a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
        c = 2 * np.arcsin(np.sqrt(a))
        
        return R * c
    
    def engineer_features(self, df):
        """Apply feature engineering to the dataset"""
        print("Starting feature engineering...")
        
        # Convert datetime
        df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
        df['dob'] = pd.to_datetime(df['dob'])
        
        # Time features
        df['Hour_of_Day'] = df['trans_date_trans_time'].dt.hour
        df['Day_of_Week'] = df['trans_date_trans_time'].dt.dayofweek
        
        # Age calculation
        df['Age'] = (df['trans_date_trans_time'] - df['dob']).dt.days / 365.25
        
        # Sort by card and time for velocity features
        df = df.sort_values(['cc_num', 'unix_time']).reset_index(drop=True)
        
        # Velocity features - transactions by the same card in the last 1 hr and 24 hr
        print("Calculating velocity features (this may take 2-3 minutes)...")
        
        # Process card by card so the pairwise time-difference matrix stays small,
        # counting trailing-window transactions with NumPy broadcasting
        velocity_results = []
        
        for cc_num, group in df.groupby('cc_num'):
            group = group.sort_values('unix_time').reset_index(drop=True)
            times = group['unix_time'].values
            
            # Vectorized calculation using broadcasting
            time_matrix = times[:, np.newaxis] - times[np.newaxis, :]
            
            # Count transactions in windows (only look backwards, hence time_matrix > 0)
            count_1hr = np.sum((time_matrix > 0) & (time_matrix <= 3600), axis=1)
            count_24hr = np.sum((time_matrix > 0) & (time_matrix <= 86400), axis=1)
            
            group['Txns_Last_1Hr'] = count_1hr
            group['Txns_Last_24Hr'] = count_24hr
            
            velocity_results.append(group)
        
        # Combine all results
        df = pd.concat(velocity_results, ignore_index=True)
        
        # Re-sort chronologically so downstream streaming follows transaction time
        df = df.sort_values('unix_time').reset_index(drop=True)
        
        print(f"Velocity features calculated. Sample: 1Hr={df['Txns_Last_1Hr'].mean():.2f}, 24Hr={df['Txns_Last_24Hr'].mean():.2f}")
        
        # Geospatial features - distance between customer and merchant
        print("Calculating geospatial features...")
        df['Haversine_Distance'] = self.haversine_distance(
            df['lat'], df['long'], df['merch_lat'], df['merch_long']
        )
        
        # Target encoding for merchant and category
        print("Applying target encoding...")
        df['Merchant_Fraud_Rate'] = df.groupby('merchant')['is_fraud'].transform('mean')
        df['Category_Fraud_Rate'] = df.groupby('category')['is_fraud'].transform('mean')
        
        # Select features for modeling
        feature_cols = ['amt', 'Age', 'Hour_of_Day', 'Day_of_Week', 
                       'Txns_Last_1Hr', 'Txns_Last_24Hr', 'Haversine_Distance',
                       'Merchant_Fraud_Rate', 'Category_Fraud_Rate', 'city_pop']
        
        # Scale features
        print("Scaling features...")
        scaled_features = self.scaler.fit_transform(df[feature_cols])
        scaled_df = pd.DataFrame(scaled_features, 
                                columns=[f'Scaled_{col}' for col in feature_cols])
        
        # Combine with original
        df = pd.concat([df.reset_index(drop=True), scaled_df], axis=1)
        
        print(f"Feature engineering complete. Final shape: {df.shape}")
        return df
    
    def load_and_process_data(self):
        """Load and process the entire dataset"""
        print(f"Loading data from {self.data_path}...")
        df = pd.read_csv(self.data_path)
        print(f"Loaded {len(df)} rows")
        
        # Engineer features
        df = self.engineer_features(df)
        
        # Save processed data
        output_path = 'data/processed_data.csv'
        print(f"Saving processed data to {output_path}...")
        df.to_csv(output_path, index=False)
        print("Data processing complete!")
        
        return df
    
    def stream_transactions(self, processed_data, batch_size=100):
        """Generator to stream transactions in batches"""
        for i in range(0, len(processed_data), batch_size):
            yield processed_data.iloc[i:i+batch_size]

if __name__ == "__main__":
    # Test the data handler
    handler = DataHandler()
    df = handler.load_and_process_data()
    print("\nFirst few rows of processed data:")
    print(df.head())
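
    # A minimal sketch of consuming the batch generator; batch_size=100 is an
    # illustrative choice, adjust it to whatever the downstream consumer expects.
    print("\nStreaming the first batch of transactions:")
    first_batch = next(handler.stream_transactions(df, batch_size=100))
    print(f"Batch shape: {first_batch.shape}")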