lulavc committed
Commit eefdc40 · verified · 1 parent: d0bcc31

Revert to GLM-4.6V (stable)

Files changed (1):
  1. app.py (+31, -29)
app.py CHANGED
@@ -1,6 +1,6 @@
 """
 BubbleScribe - AI Manga & Comic Translator
-Translate manga/comics using Qwen2-VL for OCR + Translation and LaMa for inpainting.
+Translate manga/comics using GLM-4.6V for OCR + Translation and LaMa for inpainting.
 Optimized for NVIDIA T4 GPU
 """
 
@@ -13,7 +13,7 @@ import re
 import numpy as np
 from PIL import Image, ImageDraw, ImageFont
 from io import BytesIO
-from huggingface_hub import InferenceClient
+from openai import OpenAI
 from concurrent.futures import ThreadPoolExecutor
 import threading
 
@@ -90,20 +90,20 @@ def get_font(size: int):
     return font
 
 # ============================================================
-# QWEN2-VL CLIENT (HuggingFace Inference API)
+# GLM-4.6V CLIENT (Z.ai API)
 # ============================================================
 
-_hf_client = None
+_glm_client = None
 
-def get_hf_client():
-    """Get or create HuggingFace Inference client."""
-    global _hf_client
-    if _hf_client is None:
-        api_key = os.environ.get("HF_TOKEN")
+def get_glm_client():
+    """Get or create GLM client."""
+    global _glm_client
+    if _glm_client is None:
+        api_key = os.environ.get("GLM_API_KEY")
         if not api_key:
             return None
-        _hf_client = InferenceClient(api_key=api_key)
-    return _hf_client
+        _glm_client = OpenAI(api_key=api_key, base_url="https://api.z.ai/api/paas/v4")
+    return _glm_client
 
 # ============================================================
 # IMAGE UTILITIES
@@ -225,12 +225,12 @@ def safe_parse_json(text: str) -> list:
 # ============================================================
 
 def detect_and_translate(image: Image.Image, source_lang: str, target_lang: str, progress=gr.Progress()):
-    """Use Qwen2-VL to detect text regions and translate."""
-    client = get_hf_client()
+    """Use GLM-4.6V to detect text regions and translate."""
+    client = get_glm_client()
     if not client:
-        return None, "Error: HF_TOKEN not set in Space secrets"
+        return None, "Error: GLM_API_KEY not set in Space secrets"
 
-    progress(0.1, desc="Analyzing image with Qwen2-VL...")
+    progress(0.1, desc="Analyzing image with GLM-4.6V...")
 
     original_size = image.size
 
@@ -281,7 +281,7 @@ CRITICAL: Find at least 20-50 text regions. This image has many text elements. S
 
     try:
         response = client.chat.completions.create(
-            model="Qwen/Qwen2-VL-7B-Instruct",
+            model="glm-4.6v-flash",
             messages=[
                 {
                     "role": "user",
@@ -300,17 +300,19 @@ CRITICAL: Find at least 20-50 text regions. This image has many text elements. S
         progress(0.4, desc="Processing response...")
 
         result_text = ""
-        if response.choices and len(response.choices) > 0:
-            msg = response.choices[0].message
-            if hasattr(msg, 'content') and msg.content:
-                result_text = msg.content
+        msg = response.choices[0].message
 
-        # Strip any special tokens
+        # Try multiple response fields
+        if hasattr(msg, 'content') and msg.content:
+            result_text = msg.content
+        if hasattr(msg, 'reasoning_content') and msg.reasoning_content:
+            result_text = result_text + "\n" + msg.reasoning_content if result_text else msg.reasoning_content
+
+        # Strip GLM special tokens
         result_text = result_text.replace('<|begin_of_box|>', '').replace('<|end_of_box|>', '')
-        result_text = result_text.replace('<|im_start|>', '').replace('<|im_end|>', '')
 
-        print(f"📝 Qwen2-VL Response length: {len(result_text)} chars")
-        print(f"📝 Qwen2-VL Response preview: {result_text[:500] if result_text else 'EMPTY'}...")
+        print(f"📝 GLM-4.6V Response length: {len(result_text)} chars")
+        print(f"📝 GLM-4.6V Response preview: {result_text[:500] if result_text else 'EMPTY'}...")
 
         # Parse JSON from response with robust error handling
         detections = safe_parse_json(result_text)
@@ -531,7 +533,7 @@ def translate_manga(image, source_lang, target_lang, show_boxes, apply_inpaint,
     image = image.convert('RGB')
 
     # Step 1: Detect and translate
-    progress(0.1, desc="🔍 Detecting text with Qwen2-VL...")
+    progress(0.1, desc="🔍 Detecting text with GLM-4.6V...")
     detections, status = detect_and_translate(image, source_lang, target_lang, progress)
 
     if detections is None:
@@ -652,13 +654,13 @@ with gr.Blocks(title="BubbleScribe", css=css, theme=gr.themes.Soft()) as demo:
     gr.HTML("""
     <div class="header">
         <h1>✍️ BubbleScribe</h1>
-        <p>AI-powered manga & comic translator using Qwen2-VL + LaMa</p>
+        <p>AI-powered manga & comic translator using GLM-4.6V + LaMa</p>
     </div>
     """)
 
     gr.HTML("""
    <div class="stats">
-        ⚡ <strong>Models:</strong> Qwen2-VL (OCR & Translation) + LaMa (Inpainting)
+        ⚡ <strong>Models:</strong> GLM-4.6V (OCR & Translation) + LaMa (Inpainting)
     </div>
     """)
 
@@ -754,13 +756,13 @@ with gr.Blocks(title="BubbleScribe", css=css, theme=gr.themes.Soft()) as demo:
     - Sound effects may not always be detected
 
     ### 🔧 Powered By
-    - **Qwen2-VL** - Text detection & translation (HuggingFace Inference API)
+    - **GLM-4.6V** - Text detection & translation (Z.ai API)
    - **LaMa** - Text removal inpainting (GPU-accelerated)
     """)
 
     gr.HTML("""
     <div style="text-align: center; margin-top: 20px; padding: 10px; background: rgba(0,0,0,0.05); border-radius: 8px;">
-        <strong>Models:</strong> <a href="https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct" target="_blank">Qwen2-VL-7B</a> (OCR & Translation) •
+        <strong>Models:</strong> <a href="https://huggingface.co/zai-org/GLM-4.6V" target="_blank">GLM-4.6V</a> (OCR & Translation) •
         <a href="https://github.com/advimman/lama" target="_blank">LaMa</a> (Inpainting) •
         <strong>Created by:</strong> <a href="https://huggingface.co/lulavc" target="_blank">@lulavc</a>
     </div>
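
For quick verification of the reverted code path, a minimal standalone sketch of the client setup this commit restores. It assumes the openai Python package (v1+) and a GLM_API_KEY secret, as in the diff; the text-only prompt is a stand-in for the Space's actual image message, which is omitted here.

import os

from openai import OpenAI

# Same setup as get_glm_client() in this commit: Z.ai's OpenAI-compatible endpoint.
client = OpenAI(
    api_key=os.environ["GLM_API_KEY"],
    base_url="https://api.z.ai/api/paas/v4",
)

# Hypothetical smoke test; the Space sends an image plus the OCR/translation prompt instead.
response = client.chat.completions.create(
    model="glm-4.6v-flash",
    messages=[{"role": "user", "content": "Reply with the single word: pong"}],
)

msg = response.choices[0].message
# Mirror the commit's defensive read: some GLM responses carry text in reasoning_content.
text = msg.content or getattr(msg, "reasoning_content", None) or ""
print(text)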