Text Classification
Transformers
Safetensors
Turkish
quality_classifier
feature-extraction
quality-classifier
data-filtering
pretraining
custom_code
Instructions to use AdaMLLab/mmBERT-Turkish-Quality-Classifier with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use AdaMLLab/mmBERT-Turkish-Quality-Classifier with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-classification", model="AdaMLLab/mmBERT-Turkish-Quality-Classifier", trust_remote_code=True)
```

```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("AdaMLLab/mmBERT-Turkish-Quality-Classifier", trust_remote_code=True, dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
"""
Configuration for HQ document quality classifiers.

Defines language-specific settings, dataset sources, and training hyperparameters
for the FineWeb2-HQ methodology.
"""

from pathlib import Path

# =============================================================================
# Paths
# =============================================================================

# Directory that contains this configuration module.
HQ_DIR = Path(__file__).parent

# Parent directory of HQ_DIR.
SRC_DIR = HQ_DIR.parent

# Parent directory of SRC_DIR, i.e. two levels above this module's directory.
# NOTE(review): presumably the project root - confirm against the repo layout.
BASE_DIR = SRC_DIR.parent
# =============================================================================
# Available Encoder Models
# =============================================================================

# Registry of selectable encoders, keyed by a short alias.
#
# Every entry carries the same four fields:
#   model_name     - model identifier string (looks like a Hugging Face Hub
#                    repo id - confirm against the loading code).
#   max_length     - 512 for every encoder listed here (presumably the
#                    tokenizer truncation length - TODO confirm).
#   embedding_dim  - size of the embedding vector associated with the encoder.
#   description    - human-readable one-line summary.
ENCODER_MODELS = {
    "mmbert-small": {
        "model_name": "jhu-clsp/mmBERT-small",
        "max_length": 512,
        "embedding_dim": 384,
        "description": "mmBERT-small: Modern multilingual encoder (1800+ languages)",
    },
    "mmbert-base": {
        "model_name": "jhu-clsp/mmBERT-base",
        "max_length": 512,
        "embedding_dim": 768,
        "description": "mmBERT-base: Larger multilingual encoder (1800+ languages)",
    },
    "xlm-roberta-base": {
        "model_name": "FacebookAI/xlm-roberta-base",
        "max_length": 512,
        "embedding_dim": 768,
        "description": "XLM-RoBERTa-base: Classic multilingual encoder (100 languages)",
    },
    "xlm-roberta-large": {
        "model_name": "FacebookAI/xlm-roberta-large",
        "max_length": 512,
        "embedding_dim": 1024,
        "description": "XLM-RoBERTa-large: Larger classic multilingual encoder",
    },
}
# Default encoder: must be a key of ENCODER_MODELS (it is looked up just below).
DEFAULT_ENCODER = "mmbert-small"

# =============================================================================
# Embedding Model Configuration (default)
# =============================================================================

# Shallow copy of the default encoder's registry entry, so that changing keys
# on EMBEDDING_CONFIG does not alter the entry stored in ENCODER_MODELS.
EMBEDDING_CONFIG = dict(ENCODER_MODELS[DEFAULT_ENCODER])
# =============================================================================
# Classifier Training Configuration
# =============================================================================

# Hyperparameters for classifier training.
# NOTE(review): the per-key meanings below are read off the key names only -
# confirm against the training script.
TRAINING_CONFIG = {
    "epochs": 6,
    "learning_rate": 0.0003,
    "batch_size": 256,  # presumably the classifier training batch size
    "hidden_dim": 256,  # presumably the classifier head's hidden layer width
    "dropout": 0.2,
    "embedding_batch_size": 32,  # presumably the batch size for encoder embedding passes
}
# =============================================================================
# Language-Specific Configuration
# =============================================================================

# Per-language settings, keyed by a "<language>_<script>" identifier
# (e.g. "ara_Arab").
#
# Every language entry contains:
#   name               - English name of the language.
#   answer_label       - the answer marker string in that language
#                        (presumably inserted when formatting MCQ items -
#                        confirm against the formatter).
#   positive_datasets  - list of dataset specs; each has dataset_id, subset,
#                        split, format_type ("mcq" or "instruction") and
#                        text_field. text_field is None in all of them, meaning
#                        a formatter is used rather than a raw text column.
#                        Some specs add language_filter with a language name.
#   negative_source    - a single dataset spec with dataset_id, subset, split
#                        and the text_field to read.
LANGUAGE_CONFIG = {
    "ara_Arab": {
        "name": "Arabic",
        "answer_label": "الإجابة:",
        "positive_datasets": [
            {
                "dataset_id": "MBZUAI/ArabicMMLU",
                "subset": "All",
                "split": "test",
                "format_type": "mcq",
                "text_field": None,  # Use formatter
            },
            {
                "dataset_id": "openai/MMMLU",
                "subset": "AR_XY",
                "split": "test",
                "format_type": "mcq",
                "text_field": None,
            },
            {
                "dataset_id": "CohereForAI/aya_dataset",
                "subset": None,
                "split": "train",
                "format_type": "instruction",
                "text_field": None,
                "language_filter": "Arabic",
            },
        ],
        "negative_source": {
            "dataset_id": "uonlp/CulturaX",
            "subset": "ar",
            "split": "train",
            "text_field": "text",
        },
    },
    "hin_Deva": {
        "name": "Hindi",
        "answer_label": "उत्तर:",
        "positive_datasets": [
            {
                "dataset_id": "openai/MMMLU",
                "subset": "HI_IN",
                "split": "test",
                "format_type": "mcq",
                "text_field": None,
            },
            {
                "dataset_id": "CohereForAI/aya_dataset",
                "subset": None,
                "split": "train",
                "format_type": "instruction",
                "text_field": None,
                "language_filter": "Hindi",
            },
        ],
        "negative_source": {
            "dataset_id": "uonlp/CulturaX",
            "subset": "hi",
            "split": "train",
            "text_field": "text",
        },
    },
    "tur_Latn": {
        "name": "Turkish",
        "answer_label": "Cevap:",
        "positive_datasets": [
            {
                "dataset_id": "AYueksel/TurkishMMLU",
                "subset": "All",
                "split": "test",
                "format_type": "mcq",
                "text_field": None,
            },
            # Note: openai/MMMLU does not have Turkish
            {
                "dataset_id": "CohereForAI/aya_dataset",
                "subset": None,
                "split": "train",
                "format_type": "instruction",
                "text_field": None,
                "language_filter": "Turkish",
            },
        ],
        "negative_source": {
            "dataset_id": "uonlp/CulturaX",
            "subset": "tr",
            "split": "train",
            "text_field": "text",
        },
    },
}
# =============================================================================
# Supported Languages
# =============================================================================

# Language identifiers that have an entry in LANGUAGE_CONFIG, in the order the
# entries are declared (iterating a dict yields its keys in insertion order).
SUPPORTED_LANGUAGES = list(LANGUAGE_CONFIG)
# =============================================================================
# Default Sampling Configuration
# =============================================================================

# Default sample caps, length filter, split ratios and seed.
# NOTE(review): the per-key meanings below are read off the key names only -
# confirm against the data-preparation code.
SAMPLING_CONFIG = {
    "max_positive_samples": 80000,
    "max_negative_samples": 80000,
    "min_text_length": 50,  # unit (characters vs. tokens) is not shown here - TODO confirm
    # The three split ratios below sum to 1.0.
    "train_ratio": 0.8,
    "valid_ratio": 0.1,
    "test_ratio": 0.1,
    "random_seed": 42,
}