import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler

class DataHandler:
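    """Loads the raw fraud transactions CSV, engineers model features, and streams processed rows in batches."""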
    def __init__(self, data_path='data/Fraud.csv'):
        self.data_path = data_path
        self.scaler = MinMaxScaler()
        
    def haversine_distance(self, lat1, lon1, lat2, lon2):
        """Calculate haversine distance between two points"""
        R = 6371  # Earth radius in km
        
        lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
        dlat = lat2 - lat1
        dlon = lon2 - lon1
        
        a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
        c = 2 * np.arcsin(np.sqrt(a))
        
        return R * c
    
    def engineer_features(self, df):
        """Apply feature engineering to the dataset"""
        print("Starting feature engineering...")
        
        # Convert datetime
        df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
        df['dob'] = pd.to_datetime(df['dob'])
        
        # Time features
        df['Hour_of_Day'] = df['trans_date_trans_time'].dt.hour
        df['Day_of_Week'] = df['trans_date_trans_time'].dt.dayofweek
        
        # Age calculation
        df['Age'] = (df['trans_date_trans_time'] - df['dob']).dt.days / 365.25
        
        # Sort by card and time for velocity features
        df = df.sort_values(['cc_num', 'unix_time']).reset_index(drop=True)
        
        # Velocity features - transactions by the same card in the last 1 hr and 24 hr
        print("Calculating velocity features (this may take 2-3 minutes)...")
        
        # Process card by card so the pairwise time-difference matrix stays small,
        # counting trailing-window transactions with NumPy broadcasting
        velocity_results = []
        
        for cc_num, group in df.groupby('cc_num'):
            group = group.sort_values('unix_time').reset_index(drop=True)
            times = group['unix_time'].values
            
            # Vectorized calculation using broadcasting
            time_matrix = times[:, np.newaxis] - times[np.newaxis, :]
            
            # Count transactions in windows (only look backwards, hence time_matrix > 0)
            count_1hr = np.sum((time_matrix > 0) & (time_matrix <= 3600), axis=1)
            count_24hr = np.sum((time_matrix > 0) & (time_matrix <= 86400), axis=1)
            
            group['Txns_Last_1Hr'] = count_1hr
            group['Txns_Last_24Hr'] = count_24hr
            
            velocity_results.append(group)
        
        # Combine all results
        df = pd.concat(velocity_results, ignore_index=True)
        
        # Re-sort chronologically so downstream streaming follows transaction time
        df = df.sort_values('unix_time').reset_index(drop=True)
        
        print(f"Velocity features calculated. Sample: 1Hr={df['Txns_Last_1Hr'].mean():.2f}, 24Hr={df['Txns_Last_24Hr'].mean():.2f}")
        
        # Geospatial features - distance between customer and merchant
        print("Calculating geospatial features...")
        df['Haversine_Distance'] = self.haversine_distance(
            df['lat'], df['long'], df['merch_lat'], df['merch_long']
        )
        
        # Target encoding for merchant and category
        print("Applying target encoding...")
        df['Merchant_Fraud_Rate'] = df.groupby('merchant')['is_fraud'].transform('mean')
        df['Category_Fraud_Rate'] = df.groupby('category')['is_fraud'].transform('mean')
        
        # Select features for modeling
        feature_cols = ['amt', 'Age', 'Hour_of_Day', 'Day_of_Week', 
                       'Txns_Last_1Hr', 'Txns_Last_24Hr', 'Haversine_Distance',
                       'Merchant_Fraud_Rate', 'Category_Fraud_Rate', 'city_pop']
        
        # Scale features
        print("Scaling features...")
        scaled_features = self.scaler.fit_transform(df[feature_cols])
        scaled_df = pd.DataFrame(scaled_features, 
                                columns=[f'Scaled_{col}' for col in feature_cols])
        
        # Combine with original
        df = pd.concat([df.reset_index(drop=True), scaled_df], axis=1)
        
        print(f"Feature engineering complete. Final shape: {df.shape}")
        return df
    
    def load_and_process_data(self):
        """Load and process the entire dataset"""
        print(f"Loading data from {self.data_path}...")
        df = pd.read_csv(self.data_path)
        print(f"Loaded {len(df)} rows")
        
        # Engineer features
        df = self.engineer_features(df)
        
        # Save processed data
        output_path = 'data/processed_data.csv'
        print(f"Saving processed data to {output_path}...")
        df.to_csv(output_path, index=False)
        print("Data processing complete!")
        
        return df
    
    def stream_transactions(self, processed_data, batch_size=100):
        """Generator to stream transactions in batches"""
        for i in range(0, len(processed_data), batch_size):
            yield processed_data.iloc[i:i+batch_size]

if __name__ == "__main__":
    # Test the data handler
    handler = DataHandler()
    df = handler.load_and_process_data()
    print("\nFirst few rows of processed data:")
    print(df.head())
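
    # A minimal sketch of consuming the batch generator; batch_size=100 is an
    # illustrative choice, adjust it to whatever the downstream consumer expects.
    print("\nStreaming the first batch of transactions:")
    first_batch = next(handler.stream_transactions(df, batch_size=100))
    print(f"Batch shape: {first_batch.shape}")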