Upload 6 files

Files changed (6) hide show

README.md CHANGED Viewed

@@ -1,3 +1,27 @@
----
-license: mit
----

+# 🧠 Beast Spam Detector
+This is a spam detection model built from scratch using PyTorch. It includes:
+- Custom tokenizer
+- CNN + BiLSTM model
+- Trained weights (.pt and .safetensors)
+- Easy-to-use inference script
+## 📦 Usage
+```bash
+python check_spam.py
+```
+Type or paste your email content, then press Enter twice (i.e. finish with a blank line) to get the prediction. Note that the first blank line ends the input.
+## 🧠 Model
+Built using a custom tokenizer and a CNN + BiLSTM architecture. Safe to use.
+## 📁 Files
+- `beast_spam_model.pt`: PyTorch weights
+- `beast_spam_model.safetensors`: Safe format model
+- `model.py`: Tokenizer + model
+- `check_spam.py`: Inference script

beast_spam_model.pt ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:7f6cb7a302306b414092528a5bd0c0f324715b47bb4ace0ff8a42d489f16c872
+size 3290696

beast_spam_model.safetensors ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:6851494e1d42d84ee3c1d0d15d110d81892da3160b190b112a7a8460eb52962d
+size 216

check_spam.py ADDED Viewed

+import torch
+import re
+from model import BeastTokenizer, BeastSpamModel
+from safetensors.torch import load_file
# Patterns used to normalise raw email text before tokenisation.
_URL_RE = re.compile(r"http\S+")
_NON_WORD_RE = re.compile(r"\W")
_WHITESPACE_RE = re.compile(r"\s+")


def predict_spam(text, tokenizer, model, threshold=0.5):
    """Classify one email body and return a human-readable verdict.

    The text is lower-cased, stripped of ``http...`` URLs, has every
    non-word character replaced by a space, and has runs of whitespace
    collapsed before it is handed to the tokenizer.

    Args:
        text: Raw email content.
        tokenizer: Object exposing ``encode(str)`` that returns a list of
            integer token ids.
        model: Callable that takes a ``(1, seq_len)`` long tensor and returns
            a single-element tensor holding the spam score.
        threshold: Scores strictly greater than this value are reported as
            spam. Defaults to 0.5.

    Returns:
        ``"🔥 It is SPAM!"`` when the score exceeds ``threshold``, otherwise
        ``"✅ It is NOT spam."``.
    """
    # URLs must be removed before punctuation is replaced, otherwise the
    # "http..." token would be split into pieces and survive as words.
    without_urls = _URL_RE.sub("", text.lower())
    words_only = _NON_WORD_RE.sub(" ", without_urls)
    cleaned = _WHITESPACE_RE.sub(" ", words_only).strip()

    token_ids = tokenizer.encode(cleaned)
    # Wrap in a list to add the batch dimension the model is called with.
    batch = torch.tensor([token_ids], dtype=torch.long)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        score = model(batch).item()

    if score > threshold:
        return "🔥 It is SPAM!"
    return "✅ It is NOT spam."
if __name__ == "__main__":
    # Interactive entry point: read an email from stdin, load the model,
    # and print the spam verdict.
    print("📩 Enter the full email content below (press Enter twice to finish):\n")
    lines = []
    while True:
        try:
            line = input()
        except EOFError:
            # Piped or redirected stdin can end without a trailing blank
            # line; treat end-of-input the same as the blank terminator.
            break
        if line.strip() == "":
            break
        lines.append(line)
    email = "\n".join(lines)

    # NOTE(review): this is a placeholder vocabulary, not the one used for
    # training. With it, nearly every word of a real email is encoded as
    # <UNK>. Replace with the training vocabulary (manually or from file).
    texts = ["this is dummy tokenizer data"]
    tokenizer = BeastTokenizer(texts)

    # Load the checkpoint first so the embedding table can be sized from the
    # stored weights. Sizing it from the placeholder vocabulary would make
    # load_state_dict fail with a shape mismatch on a real checkpoint.
    state_dict = load_file("beast_spam_model.safetensors")
    embedding_weight = state_dict.get("embedding.weight")
    if embedding_weight is None:
        vocab_size = len(tokenizer.word2idx)
    else:
        vocab_size = embedding_weight.shape[0]

    model = BeastSpamModel(vocab_size)
    model.load_state_dict(state_dict)
    model.eval()

    print("\n[🔍] Checking email...")
    print(f"[🧠] Result: {predict_spam(email, tokenizer, model)}")

model.py ADDED Viewed

+import torch
+import torch.nn as nn
+from collections import Counter
class BeastTokenizer:
    """Whitespace tokenizer that maps words to integer ids.

    Id 0 is reserved for ``<PAD>`` and id 1 for ``<UNK>``. The remaining ids
    (starting at 2) are given to the most frequent words of the texts passed
    at construction time, most frequent first.
    """

    PAD_TOKEN = '<PAD>'
    UNK_TOKEN = '<UNK>'
    PAD_IDX = 0
    UNK_IDX = 1

    def __init__(self, texts=None, vocab_size=5000):
        """Build the vocabulary.

        Args:
            texts: Iterable of strings to count words from. A single string
                is treated as one text. ``None`` or an empty iterable leaves
                only the two special tokens in the vocabulary.
            vocab_size: Maximum vocabulary size, including the two special
                tokens.
        """
        self.word2idx = {self.PAD_TOKEN: self.PAD_IDX, self.UNK_TOKEN: self.UNK_IDX}
        if not texts:
            return

        # A bare string would otherwise be iterated character by character.
        if isinstance(texts, str):
            texts = [texts]

        # Skip the reserved tokens: counting them would let the update below
        # overwrite ids 0/1 and break padding and unknown-word lookup.
        counter = Counter(
            word
            for text in texts
            for word in text.split()
            if word not in self.word2idx
        )
        common = counter.most_common(max(vocab_size - 2, 0))
        self.word2idx.update(
            {word: idx + 2 for idx, (word, _) in enumerate(common)}
        )

    def encode(self, text, max_len=100):
        """Convert ``text`` into exactly ``max_len`` token ids.

        Words are split on whitespace; words missing from the vocabulary map
        to the ``<UNK>`` id. Longer inputs are truncated and shorter ones are
        right-padded with the ``<PAD>`` id.

        Args:
            text: String to encode.
            max_len: Length of the returned list.

        Returns:
            A list of ``max_len`` integer ids.
        """
        tokens = [self.word2idx.get(word, self.UNK_IDX) for word in text.split()]
        # A negative multiplier yields an empty list, so no padding is added
        # when the text is already max_len tokens or longer.
        padding = [self.PAD_IDX] * (max_len - len(tokens))
        return tokens[:max_len] + padding
class BeastSpamModel(nn.Module):
    """Binary text classifier: embedding -> 1-D convolution -> BiLSTM -> sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each embedding vector.
        hidden_dim: Hidden size of the LSTM, per direction.
        conv_channels: Number of output channels of the convolution, which
            is also the LSTM input size. The default matches the original
            fixed value of 128.
        kernel_size: Width of the convolution kernel. Padding is set to
            ``kernel_size // 2``, which keeps the sequence length unchanged
            for odd kernel sizes. The default matches the original fixed
            value of 5.
    """

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=64,
                 conv_channels=128, kernel_size=5):
        super().__init__()
        # Attribute names are the state-dict key prefixes; keep them stable
        # so existing checkpoints continue to load.
        # Index 0 is the padding id, so its embedding is not trained.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.conv = nn.Conv1d(
            embed_dim,
            conv_channels,
            kernel_size=kernel_size,
            padding=kernel_size // 2,
        )
        self.lstm = nn.LSTM(
            conv_channels, hidden_dim, batch_first=True, bidirectional=True
        )
        # The bidirectional LSTM emits 2 * hidden_dim features per step.
        self.fc = nn.Linear(hidden_dim * 2, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return one spam probability per sequence.

        Args:
            x: Long tensor of token ids, shaped ``(batch, seq_len)``.

        Returns:
            Float tensor of shape ``(batch,)`` with values in ``[0, 1]``.
        """
        embedded = self.embedding(x)                 # (batch, seq, embed_dim)
        # Conv1d expects channels before the sequence axis.
        conv_in = embedded.permute(0, 2, 1)          # (batch, embed_dim, seq)
        conv_out = self.conv(conv_in).permute(0, 2, 1)  # (batch, seq', channels)
        lstm_out, _ = self.lstm(conv_out)
        # NOTE(review): at the last time step the reverse direction has only
        # processed the final (often padding) position. Kept unchanged
        # because altering the readout would change what existing trained
        # weights compute.
        last_step = lstm_out[:, -1, :]
        logits = self.fc(last_step)
        return self.sigmoid(logits).squeeze(1)

requirements.txt ADDED Viewed

+torch
+safetensors
+datasets
+scikit-learn