Upload 6 files
Browse files- README.md +27 -3
- beast_spam_model.pt +3 -0
- beast_spam_model.safetensors +3 -0
- check_spam.py +34 -0
- model.py +32 -0
- requirements.txt +4 -0
README.md
CHANGED
|
@@ -1,3 +1,27 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 🧠 Beast Spam Detector
|
| 2 |
+
|
| 3 |
+
This is a spam detection model built from scratch using PyTorch. It includes:
|
| 4 |
+
|
| 5 |
+
- Custom tokenizer
|
| 6 |
+
- CNN + BiLSTM model
|
| 7 |
+
- Trained weights (.pt and .safetensors)
|
| 8 |
+
- Easy-to-use inference script
|
| 9 |
+
|
| 10 |
+
## 📦 Usage
|
| 11 |
+
|
| 12 |
+
```bash
|
| 13 |
+
python check_spam.py
|
| 14 |
+
```
|
| 15 |
+
|
| 16 |
+
Type or paste the email content, then press Enter on an empty line to get the prediction.
|
| 17 |
+
|
| 18 |
+
## 🧠 Model
|
| 19 |
+
|
| 20 |
+
Built using a custom tokenizer and a CNN + BiLSTM architecture. Safe to use.
|
| 21 |
+
|
| 22 |
+
## 📁 Files
|
| 23 |
+
|
| 24 |
+
- `beast_spam_model.pt`: PyTorch weights
|
| 25 |
+
- `beast_spam_model.safetensors`: Safe format model
|
| 26 |
+
- `model.py`: Tokenizer + model
|
| 27 |
+
- `check_spam.py`: Inference script
|
beast_spam_model.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7f6cb7a302306b414092528a5bd0c0f324715b47bb4ace0ff8a42d489f16c872
|
| 3 |
+
size 3290696
|
beast_spam_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6851494e1d42d84ee3c1d0d15d110d81892da3160b190b112a7a8460eb52962d
|
| 3 |
+
size 216
|
check_spam.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import re
|
| 3 |
+
from model import BeastTokenizer, BeastSpamModel
|
| 4 |
+
from safetensors.torch import load_file
|
| 5 |
+
|
| 6 |
+
def predict_spam(text, tokenizer, model, threshold=0.5):
    """Classify *text* as spam or not spam and return a display message.

    The text is lower-cased, ``http...`` URLs are removed, every non-word
    character is replaced by a space, and runs of whitespace are collapsed
    before the result is handed to ``tokenizer.encode``.

    Args:
        text: Raw email text.
        tokenizer: Object whose ``encode(str)`` returns a list of integer ids.
        model: Callable that takes a ``(1, seq_len)`` long tensor and returns
            a single-element tensor holding the spam score.
        threshold: Scores strictly greater than this value are reported as
            spam. Defaults to 0.5, the value that was previously hard-coded.

    Returns:
        ``"🔥 It is SPAM!"`` when the score exceeds *threshold*, otherwise
        ``"✅ It is NOT spam."``.
    """
    # Order matters: URLs must be stripped before punctuation is blanked,
    # otherwise "http://..." would already be split into separate words.
    without_urls = re.sub(r"http\S+", "", text.lower())
    words_only = re.sub(r"\W", " ", without_urls)
    cleaned = re.sub(r"\s+", " ", words_only).strip()

    encoded = tokenizer.encode(cleaned)
    # Wrap in a list to add the batch dimension expected by the model.
    batch = torch.tensor([encoded], dtype=torch.long)

    with torch.no_grad():
        score = model(batch).item()

    if score > threshold:
        return "🔥 It is SPAM!"
    return "✅ It is NOT spam."
|
| 13 |
+
|
| 14 |
+
def read_email(read_line=input):
    """Collect lines until a blank line or end-of-input and join them.

    Args:
        read_line: Zero-argument callable returning the next line of text.
            Defaults to the built-in ``input``.

    Returns:
        The collected lines joined with ``"\\n"`` (empty string if none).
    """
    lines = []
    while True:
        try:
            line = read_line()
        except EOFError:
            # Piped/redirected stdin ends without a trailing blank line;
            # treat end-of-input the same as the blank-line terminator.
            break
        if line.strip() == "":
            break
        lines.append(line)
    return "\n".join(lines)


def main():
    """Read an email from stdin, load the model, and print the verdict."""
    print("📩 Enter the full email content below (press Enter twice to finish):\n")
    email = read_email()

    # Load tokenizer vocab (manually or from file)
    # NOTE(review): the vocabulary below is built from a single placeholder
    # sentence, so almost every word of a real email maps to <UNK> and the
    # embedding size passed to the model is tiny. If the checkpoint was
    # trained with a different vocabulary size, load_state_dict is expected
    # to fail with a shape mismatch — replace this with the training
    # vocabulary. TODO confirm against the training script.
    texts = ["this is dummy tokenizer data"]
    tokenizer = BeastTokenizer(texts)

    # Load model
    model = BeastSpamModel(len(tokenizer.word2idx))
    model.load_state_dict(load_file("beast_spam_model.safetensors"))
    model.eval()

    print("\n[🔍] Checking email...")
    print(f"[🧠] Result: {predict_spam(email, tokenizer, model)}")


if __name__ == "__main__":
    main()
|
model.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
from collections import Counter
|
| 4 |
+
|
| 5 |
+
class BeastTokenizer:
    """Whitespace tokenizer that maps words to integer ids.

    Ids 0 and 1 are reserved for ``<PAD>`` and ``<UNK>``. The remaining ids
    (starting at 2) go to the most frequent words of the texts supplied at
    construction time, most frequent first.
    """

    def __init__(self, texts=None, vocab_size=5000):
        """Build the vocabulary.

        Args:
            texts: Optional iterable of strings to count words from. When
                omitted or empty, only the two reserved tokens exist.
            vocab_size: Upper bound on the vocabulary size, including the
                two reserved tokens.
        """
        # `texts` defaults to None rather than a shared mutable list.
        self.word2idx = {'<PAD>': 0, '<UNK>': 1}
        if texts:
            counter = Counter(word for text in texts for word in text.split())
            # Two slots are already taken by <PAD> and <UNK>.
            common = counter.most_common(vocab_size - 2)
            self.word2idx.update(
                {word: idx + 2 for idx, (word, _) in enumerate(common)}
            )

    def encode(self, text, max_len=100):
        """Convert *text* to a list of exactly *max_len* token ids.

        Words missing from the vocabulary become 1 (``<UNK>``). Longer
        inputs are truncated; shorter ones are right-padded with 0
        (``<PAD>``).
        """
        unk_id = self.word2idx['<UNK>']
        tokens = [self.word2idx.get(word, unk_id) for word in text.split()]
        tokens = tokens[:max_len]
        padding = [self.word2idx['<PAD>']] * (max_len - len(tokens))
        return tokens + padding
|
| 16 |
+
|
| 17 |
+
class BeastSpamModel(nn.Module):
    """Embedding -> 1-D convolution -> bidirectional LSTM -> sigmoid score."""

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=64):
        """Create the layers.

        Args:
            vocab_size: Number of rows in the embedding table.
            embed_dim: Width of each token embedding.
            hidden_dim: Hidden size of each LSTM direction.
        """
        super().__init__()
        conv_channels = 128
        # Index 0 is the padding token.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        # kernel_size=5 with padding=2 keeps the sequence length unchanged.
        self.conv = nn.Conv1d(embed_dim, conv_channels, kernel_size=5, padding=2)
        self.lstm = nn.LSTM(
            conv_channels, hidden_dim, batch_first=True, bidirectional=True
        )
        # Both LSTM directions are concatenated, hence hidden_dim * 2.
        self.fc = nn.Linear(hidden_dim * 2, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return one score in [0, 1] per row of the ``(batch, seq)`` id tensor."""
        embedded = self.embedding(x)
        # Conv1d wants channels before sequence; swap back for the LSTM.
        channels_first = embedded.transpose(1, 2)
        features = self.conv(channels_first).transpose(1, 2)
        sequence, _ = self.lstm(features)
        # Only the output at the final time step feeds the classifier.
        final_step = sequence[:, -1, :]
        logits = self.fc(final_step)
        return self.sigmoid(logits).squeeze(1)
|
requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch
|
| 2 |
+
safetensors
|
| 3 |
+
datasets
|
| 4 |
+
scikit-learn
|