sitammeur committed · Commit 3345f41 · verified · 1 Parent(s): e9061b2

Update app.py

Files changed (1):
  1. app.py +18 -18
app.py CHANGED

@@ -53,7 +53,7 @@ def convert_files(files):
 
 
 ################################################
-# Model Inference with ColPali and Gemini
+# Model Inference with ModernVBERT and Qwen
 ################################################
 @spaces.GPU
 def index_gpu(images, ds):
@@ -81,17 +81,17 @@ def index_gpu(images, ds):
     return f"Uploaded and converted {len(images)} pages", ds, images
 
 
-def query_gemini(query, images, api_key):
-    """Calls Google's Gemini model with the query and image data."""
+def query_qwen(query, images, api_key):
+    """Calls Qwen model with the query and image data."""
     if api_key:
         try:
             # Convert images to base64 strings
             base64_images = [encode_image_to_base64(image[0]) for image in images]
 
-            # Initialize the OpenAI client with the Gemini API key
+            # Initialize the OpenAI client with the Hugging Face token
             client = OpenAI(
                 api_key=api_key.strip(),
-                base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
+                base_url="https://router.huggingface.co/v1",
             )
             PROMPT = """
             You are a smart assistant designed to answer questions about a PDF document.
@@ -105,9 +105,9 @@ def query_gemini(query, images, api_key):
             PDF pages:
             """
 
-            # Get the response from the Gemini API
+            # Get the response from the Qwen inference API
             response = client.chat.completions.create(
-                model="gemini-2.5-flash-lite",
+                model="Qwen/Qwen3-VL-30B-A3B-Instruct",
                 reasoning_effort="none",
                 messages=[
                     {
@@ -132,10 +132,10 @@ def query_gemini(query, images, api_key):
 
         # Handle errors from the API
         except Exception as e:
-            return "API connection error! Please check your API key and try again."
+            return "API connection error! Please check your API token and try again."
 
-    # If no API key is provided, return a message indicating that the user should enter their key
-    return "Enter your Gemini API key to get a custom response."
+    # If no API token is provided, return a message indicating that the user should enter their token
+    return "Enter your Hugging Face token to get a custom response."
 
 
 ################################################
@@ -175,8 +175,8 @@ def search(query: str, ds, images, k, api_key):
         img_copy = img.copy()
         results.append((img_copy, f"Page {idx}"))
 
-    # Generate response from Gemini
-    ai_response = query_gemini(query, results, api_key)
+    # Generate response
+    ai_response = query_qwen(query, results, api_key)
 
     return results, ai_response
 
@@ -186,11 +186,11 @@ def search(query: str, ds, images, k, api_key):
 ################################################
 with gr.Blocks(theme=gr.themes.Ocean()) as demo:
     gr.Markdown(
-        "# Multimodal RAG with ColVision & Gemini 📚"
+        "# Multimodal RAG with ModernVBERT & Qwen 📚"
     )
     gr.Markdown(
-        """Demo to test ColQwen2.5 (ColPali) on PDF documents.
-        ColPali is a model implemented from the [ColPali paper](https://arxiv.org/abs/2407.01449).
+        """Demo to test ColModernVBERT (ModernVBERT) on PDF documents.
+        ModernVBERT is a model implemented from the paper [ModernVBERT: Towards Smaller Visual Document Retrievers](https://arxiv.org/abs/2510.01149).
         This demo allows you to upload PDF files and search for the most relevant pages based on your query.
         Refresh the page if you change documents!
        ⚠️ This demo uses a model trained exclusively on A4 PDFs in portrait mode, containing English text. Performance is expected to drop for other page formats and languages.
@@ -213,8 +213,8 @@ with gr.Blocks(theme=gr.themes.Ocean()) as demo:
     with gr.Column(scale=3):
         gr.Markdown("## 3️⃣ Search")
         api_key = gr.Textbox(
-            placeholder="Enter your Gemini API key here (must be valid)",
-            label="API key",
+            placeholder="Enter your Hugging Face token here (must be valid)",
+            label="API token",
         )
         query = gr.Textbox(placeholder="Enter your query here", label="Query")
         k = gr.Slider(
@@ -233,7 +233,7 @@ with gr.Blocks(theme=gr.themes.Ocean()) as demo:
             label="Retrieved Documents", height=600, show_label=True
         )
 
-        gr.Markdown("## 5️⃣ Gemini Response")
+        gr.Markdown("## 5️⃣ Qwen Response")
         output_text = gr.Textbox(
             label="AI Response",
             placeholder="Generated response based on retrieved documents",