# Spaces: Runtime error
from decord import VideoReader
import torch
from transformers import AutoImageProcessor, AutoTokenizer, VisionEncoderDecoderModel
import gradio as gr

# Use the GPU when torch reports one is available; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Pretrained components, each loaded by checkpoint name:
#   - image processor from the "MCG-NJU/videomae-base" checkpoint,
#   - tokenizer from the "gpt2" checkpoint,
#   - encoder-decoder captioning model, moved onto `device` after loading.
image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = VisionEncoderDecoderModel.from_pretrained(
    "Neleac/timesformer-gpt2-video-captioning"
)
model = model.to(device)
with gr.Blocks() as demo:
    demo.title = "Semantic Summarization of Videos using DLSG"

    gr.Markdown('# Semantic Summarization of Videos using DLSG, Demo by Batch_B29')

    # Layout: video upload + button on the left (wider) column, caption output
    # and generation settings on the right column.
    with gr.Row():
        with gr.Column(scale=2):
            video = gr.Video(label="Upload Video", format="mp4")
            generate = gr.Button(value="Generate Caption")
        with gr.Column(scale=1):
            text = gr.Textbox(label="Caption", placeholder="Caption will appear here")
            with gr.Accordion("Settings", open=True):
                with gr.Row():
                    max_length = gr.Slider(
                        label="Max Length", minimum=10, maximum=100, value=20, step=1
                    )
                    min_length = gr.Slider(
                        label="Min Length", minimum=1, maximum=10, value=10, step=1
                    )

    def generate_caption(video, max_length, min_length):
        """Generate a caption for an uploaded video.

        Args:
            video: Path of the uploaded video as supplied by the ``gr.Video``
                component, or ``None`` when nothing has been uploaded.
            max_length: Value of the "Max Length" slider, forwarded to
                ``model.generate`` as ``max_length``.
            min_length: Value of the "Min Length" slider, forwarded to
                ``model.generate`` as ``min_length``.

        Returns:
            The first decoded caption string, with special tokens removed.

        Raises:
            gr.Error: If no video was provided or the video has no frames.
        """
        if not video:
            raise gr.Error("Please upload a video before generating a caption.")

        # read video
        container = VideoReader(video)
        total_frames = len(container)
        if total_frames == 0:
            raise gr.Error("The uploaded video does not contain any frames.")

        # Number of frames per clip, read from the encoder's configuration.
        clip_len = model.config.encoder.num_frames

        # Pick exactly `clip_len` evenly spaced frame indices. Computing the
        # indices directly (instead of range() with an integer step) avoids a
        # zero step on videos shorter than `clip_len` and always yields the
        # same number of frames; short videos simply repeat frames.
        indices = [(i * total_frames) // clip_len for i in range(clip_len)]
        frames = list(container.get_batch(indices).asnumpy())

        # generate caption
        gen_kwargs = {
            # Cast defensively in case the sliders deliver float values.
            "min_length": int(min_length),
            "max_length": int(max_length),
        }

        # process frames
        pixel_values = image_processor(frames, return_tensors="pt").pixel_values.to(
            device
        )
        tokens = model.generate(pixel_values, **gen_kwargs)
        caption = tokenizer.batch_decode(tokens, skip_special_tokens=True)[0]
        return caption

    # Registered inside the Blocks context so the button click is wired up.
    generate.click(
        generate_caption,
        inputs=[video, max_length, min_length],
        outputs=text,
    )
# Launch the Gradio app only when this file is executed directly as a script.
if __name__ == "__main__":
    demo.launch()