Fix bugs
Browse files
app.py CHANGED
@@ -136,17 +136,47 @@ def handle_action(openai_key, image, prompt):

     image_description = run_wrapper(original_image_path, temp_dir, skip_ocr=False, skip_spell=True, json_mini=True)

+    if not os.path.exists(yolo_updated_image_path):
+        raise FileNotFoundError(f"YOLO updated image not found at {yolo_updated_image_path}")
     with open(yolo_updated_image_path, "rb") as f:
         yolo_updated_img_bytes = f.read()

     _, new_b64 = preprocess_image(yolo_updated_img_bytes, threshold=2000, scale=0.5, fmt="png")

     base64_image_url = f"data:image/png;base64,{new_b64}"
-
-
-
+
+    prompt_text = f"""You are an AI agent that controls a mobile device and sees the content of screen.
+    User can ask you about some information or to do some task and you need to do these tasks.
+    You can only respond with one of these commands (in quotes) but some variables are dynamic
+    and can be changed based on the context:
+    1. "Swipe left. From start coordinates 300, 400" (or other coordinates) (Goes right)
+    2. "Swipe right. From start coordinates 500, 650" (or other coordinates) (Goes left)
+    3. "Swipe top. From start coordinates 600, 510" (or other coordinates) (Goes bottom)
+    4. "Swipe bottom. From start coordinates 640, 500" (or other coordinates) (Goes top)
+    5. "Go home"
+    6. "Go back"
+    8. "Open com.whatsapp" (or other app)
+    9. "Tap coordinates 160, 820" (or other coordinates)
+    10. "Insert text 210, 820:Hello world" (or other coordinates and text)
+    11. "Screen is in a loading state. Try again" (send image again)
+    12. "Answer: There are no new important mails today" (or other answer)
+    13. "Finished" (task is finished)
+    14. "Can't proceed" (can't understand what to do or image has problem etc.)
+
+    The user said: "{prompt}"
+
+    I will share the screenshot of the current state of the phone (with UI elements highlighted and the corresponding
+    index of these UI elements) and the description (sizes, coordinates and indexes) of UI elements.
+    Description:
+    "{image_description}" """

-    messages = [
+    messages = [
+        {"role": "user", "content": [
+            {"type": "text", "text": prompt_text},
+            # We are correctly sending the YOLO-annotated image here
+            {"type": "image_url", "image_url": {"url": base64_image_url, "detail": "high"}}
+        ]}
+    ]

     response = llm_client.chat.completions.create(model="gpt-4.1", messages=messages, temperature=0.2)
     return response.choices[0].message.content.strip()
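The prompt embedded above defines a small, fixed command protocol for the agent's replies. None of the helpers below exist in app.py; this is a minimal sketch, assuming the reply strings follow the numbered formats in the prompt, of how a caller might dispatch on handle_action's return value:

import re

# Hypothetical dispatcher for the command strings the prompt allows,
# e.g. 'Tap coordinates 160, 820' or 'Insert text 210, 820:Hello world'.
def parse_command(reply: str) -> dict:
    reply = reply.strip().strip('"')
    if m := re.match(r"Swipe (left|right|top|bottom)\. From start coordinates (\d+), (\d+)", reply):
        return {"action": "swipe", "direction": m[1], "x": int(m[2]), "y": int(m[3])}
    if m := re.match(r"Tap coordinates (\d+), (\d+)", reply):
        return {"action": "tap", "x": int(m[1]), "y": int(m[2])}
    if m := re.match(r"Insert text (\d+), (\d+):(.*)", reply, re.DOTALL):
        return {"action": "insert_text", "x": int(m[1]), "y": int(m[2]), "text": m[3]}
    if m := re.match(r"Open (\S+)", reply):
        return {"action": "open_app", "package": m[1]}
    if m := re.match(r"Answer: (.*)", reply, re.DOTALL):
        return {"action": "answer", "text": m[1]}
    if reply in ("Go home", "Go back", "Finished", "Can't proceed",
                 "Screen is in a loading state. Try again"):
        return {"action": reply}
    return {"action": "unknown", "raw": reply}

For example, parse_command('Tap coordinates 160, 820') yields {'action': 'tap', 'x': 160, 'y': 820}.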
@@ -164,7 +194,7 @@ def handle_analyze(image, output_style):
     image_path = os.path.join(temp_dir, "image_to_analyze.png")
     save_base64_image(image_b64, image_path)

-    is_mini = (output_style == "
+    is_mini = (output_style == "mini JSON")
     description_str = run_wrapper(image_path=image_path, output_dir=temp_dir, json_mini=is_mini)

     parsed_json = json.loads(description_str)
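The bug fixed in this hunk (and the identical one in handle_analyze_yolo just below) is an exact-string comparison: gr.Radio passes the selected choice label to the handler verbatim, so the comparison string must match the label used to build the widget character for character. A minimal self-contained sketch of the pattern, with a shared constant so the two cannot drift apart (the names here are illustrative, not from app.py):

import gradio as gr

OUTPUT_STYLES = ["Standard JSON", "mini JSON"]  # single source of truth for label and comparison

def describe(style: str) -> str:
    # gr.Radio hands the handler the selected label string unchanged,
    # so compare against the exact constant the widget was built from.
    is_mini = (style == OUTPUT_STYLES[1])
    return "mini output" if is_mini else "standard output"

with gr.Blocks() as sketch:
    style = gr.Radio(OUTPUT_STYLES, value=OUTPUT_STYLES[0], label="Output Format")
    out = gr.Textbox()
    gr.Button("Run").click(describe, inputs=style, outputs=out)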
@@ -185,7 +215,7 @@ def handle_analyze_yolo(image, output_style):
     yolo_image_path = os.path.join(temp_dir, f"{request_id}_yolo_updated.png")
     save_base64_image(image_b64, image_path)

-    is_mini = (output_style == "
+    is_mini = (output_style == "mini JSON")
     description_str = run_wrapper(image_path=image_path, output_dir=temp_dir, json_mini=is_mini)

     parsed_json = json.loads(description_str)
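Both analyze handlers hand run_wrapper's string output straight to json.loads, which raises json.JSONDecodeError on malformed output and would surface as a generic error in the UI. A hedged sketch of a defensive wrapper (the fallback payload shape is made up for illustration):

import json

def parse_description(description_str: str) -> dict:
    # json.loads raises json.JSONDecodeError on non-JSON input; returning an
    # error payload keeps the gr.JSON output component rendering something useful.
    try:
        return json.loads(description_str)
    except json.JSONDecodeError as exc:
        return {"error": f"run_wrapper returned non-JSON output: {exc}"}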
@@ -215,15 +245,20 @@ def handle_generate(openai_key, image, prompt):

     image_description = run_wrapper(image_path=original_image_path, output_dir=temp_dir, json_mini=False)

+    if not os.path.exists(yolo_updated_image_path):
+        raise FileNotFoundError(f"YOLO updated image not found at {yolo_updated_image_path}")
     with open(yolo_updated_image_path, "rb") as f:
         yolo_updated_img_bytes = f.read()

     _, new_b64 = preprocess_image(yolo_updated_img_bytes, threshold=1500, scale=0.5, fmt="png")

     base64_image_url = f"data:image/png;base64,{new_b64}"
+
+    prompt_text = f'"Prompt: {prompt}"\nImage description:\n"{image_description}"'
+
     messages = [
         {"role": "user", "content": [
-            {"type": "text", "text":
+            {"type": "text", "text": prompt_text},
             {"type": "image_url", "image_url": {"url": base64_image_url, "detail": "high"}}
         ]}
     ]
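preprocess_image is internal to this app; from the two call sites it is assumed to take raw image bytes plus threshold, scale, and fmt and return a (bytes, base64-string) pair, downscaling large screenshots before they are sent to the model. A rough stand-in under exactly those assumptions (the real semantics of threshold may differ):

import base64
import io

from PIL import Image

def preprocess_image_sketch(img_bytes: bytes, threshold: int, scale: float, fmt: str = "png"):
    # Assumed behaviour: downscale only when the longest side exceeds `threshold` pixels.
    img = Image.open(io.BytesIO(img_bytes))
    if max(img.size) > threshold:
        img = img.resize((int(img.width * scale), int(img.height * scale)))
    buf = io.BytesIO()
    img.save(buf, format=fmt.upper())
    out_bytes = buf.getvalue()
    return out_bytes, base64.b64encode(out_bytes).decode("ascii")

The base64 string is what both handlers wrap in a data URL (f"data:image/png;base64,{new_b64}") for the image_url content part.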
@@ -272,7 +307,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         with gr.Row():
             image_input_analyze = gr.Image(type="pil", label="Upload Screen Image")
             with gr.Column():
-                output_style_analyze = gr.Radio(["Standard JSON", "
+                output_style_analyze = gr.Radio(["Standard JSON", "mini JSON"], label="Output Format", value="Standard JSON")
                 analyze_button = gr.Button("Analyze Image", variant="primary")
         analyze_output = gr.JSON(label="JSON Description")
         with gr.Row():
@@ -284,7 +319,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         with gr.Row():
             image_input_yolo = gr.Image(type="pil", label="Upload Screen Image")
             with gr.Column():
-                output_style_yolo = gr.Radio(["Standard JSON", "
+                output_style_yolo = gr.Radio(["Standard JSON", "mini JSON"], label="Output Format", value="Standard JSON")
                 yolo_button = gr.Button("Analyze and Visualize", variant="primary")
         with gr.Row():
             yolo_image_output = gr.Image(label="YOLO Annotated Image")
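A design note on the new existence guards in handle_action and handle_generate: a bare FileNotFoundError shows up in this Gradio UI as a generic error box. If a user-readable message is preferred, raising gr.Error is the idiomatic alternative; a minimal sketch reusing the same path variable:

import os

import gradio as gr

def read_annotated_image(yolo_updated_image_path: str) -> bytes:
    # gr.Error renders its message in the UI instead of a generic traceback box.
    if not os.path.exists(yolo_updated_image_path):
        raise gr.Error(f"YOLO updated image not found at {yolo_updated_image_path}")
    with open(yolo_updated_image_path, "rb") as f:
        return f.read()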