Automatic Speech Recognition
Transformers
Safetensors
phi4mm
text-generation
nlp
code
audio
speech-summarization
speech-translation
visual-question-answering
phi-4-multimodal
phi
phi-4-mini
custom_code
Eval Results
Instructions to use microsoft/Phi-4-multimodal-instruct with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use microsoft/Phi-4-multimodal-instruct with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("automatic-speech-recognition", model="microsoft/Phi-4-multimodal-instruct", trust_remote_code=True)

# Load model directly
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-4-multimodal-instruct", trust_remote_code=True, dtype="auto")
- Notebooks
- Google Colab
- Kaggle
Commit ·
cdbc4bf
1
Parent(s): cd5c468
delete `CommonKwargs`
Browse files
Signed-off-by: Liu, Kaixuan <kaixuan.liu@intel.com>
processing_phi4_multimodal.py
CHANGED
|
@@ -29,7 +29,7 @@ import PIL.Image
|
|
| 29 |
|
| 30 |
from transformers.image_processing_utils import BatchFeature
|
| 31 |
from transformers.image_utils import ImageInput
|
| 32 |
-
from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, TextKwargs, ImagesKwargs, VideosKwargs, AudioKwargs,
|
| 33 |
from transformers.tokenization_utils_base import TextInput
|
| 34 |
from transformers.utils import logging
|
| 35 |
|
|
@@ -73,7 +73,7 @@ class ChatTemplateLoadKwargs(TypedDict, total=False):
|
|
| 73 |
|
| 74 |
|
| 75 |
class AllKwargsForChatTemplate(
|
| 76 |
-
TextKwargs, ImagesKwargs, VideosKwargs, AudioKwargs,
|
| 77 |
):
|
| 78 |
processor_kwargs: ProcessingKwargs = {
|
| 79 |
**ProcessingKwargs.__annotations__,
|
|
|
|
| 29 |
|
| 30 |
from transformers.image_processing_utils import BatchFeature
|
| 31 |
from transformers.image_utils import ImageInput
|
| 32 |
+
from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, TextKwargs, ImagesKwargs, VideosKwargs, AudioKwargs, ProcessorChatTemplateKwargs
|
| 33 |
from transformers.tokenization_utils_base import TextInput
|
| 34 |
from transformers.utils import logging
|
| 35 |
|
|
|
|
| 73 |
|
| 74 |
|
| 75 |
class AllKwargsForChatTemplate(
|
| 76 |
+
TextKwargs, ImagesKwargs, VideosKwargs, AudioKwargs, ProcessorChatTemplateKwargs
|
| 77 |
):
|
| 78 |
processor_kwargs: ProcessingKwargs = {
|
| 79 |
**ProcessingKwargs.__annotations__,
|