Oleg Lavrovsky committed
Commit b9acf2f · unverified · 1 Parent(s): c46c72f

OpenAI type completions

Files changed (1)
  1. app.py +82 -34
app.py CHANGED
@@ -1,7 +1,7 @@
from contextlib import asynccontextmanager
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
- from pydantic import BaseModel
+ from pydantic import BaseModel, ValidationError

from torch import cuda
from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -40,6 +40,10 @@ class ModelResponse(BaseModel):
    confidence: float
    processing_time: float

+ class Completion(BaseModel):
+     model: str
+     prompt: str
+     max_tokens: int = 65536

@asynccontextmanager
async def lifespan(app: FastAPI):
@@ -88,6 +92,81 @@ app.add_middleware(
    allow_headers=["*"],
)

+
+ def fit_to_length(text, min_length=3, max_length=100):
+     """Truncate text if too long."""
+     text = text[:max_length]
+     if len(text) == max_length:
+         logger.warning("Warning: text truncated")
+     if len(text) < min_length:
+         logger.warning("Warning: empty text, aborting")
+         return None
+     return text
+
+
+ def get_model_reponse(text: str):
+     """Process the text content."""
+
+     # Prepare the model input
+     messages_think = [
+         {"role": "user", "content": text}
+     ]
+     text = tokenizer.apply_chat_template(
+         messages_think,
+         tokenize=False,
+         add_generation_prompt=True,
+         top_p=0.9,
+         temperature=0.8,
+     )
+     model_inputs = tokenizer(
+         [text],
+         return_tensors="pt",
+         add_special_tokens=False
+     ).to(model.device)
+
+     # Generate the output
+     generated_ids = model.generate(
+         **model_inputs,
+         max_new_tokens=512
+     )
+
+     # Get and decode the output
+     output_ids = generated_ids[0][len(model_inputs.input_ids[0]) :]
+
+     # Return just the text
+     return tokenizer.decode(output_ids, skip_special_tokens=True)
+
+
+ @app.post("/v1/models/apertus")
+ async def completion(data: Completion):
+     """Generate an OpenAPI-style completion"""
+     if model is None or tokenizer is None:
+         raise HTTPException(status_code=503, detail="Model not loaded")
+
+     try:
+         text = fit_to_length(input_data.text, input_data.max_length)
+
+         result = get_model_reponse(text, model)
+
+         return {
+             "choices": [
+                 {
+                     "text": result,
+                     "_index": 0,
+                     "logprobs": None,
+                     "finish_reason": "length"
+                 }
+             ],
+             "usage": {
+                 "prompt_tokens": len(text),
+                 "completion_tokens": len(result),
+                 "total_tokens": len(text) + len(result)
+             }
+         }
+     except ValidationError as e:
+         raise HTTPException(status_code=400, detail="Invalid input data") from e
+
+
@app.get("/predict", response_model=ModelResponse)
async def predict(q: str):
    """Generate a model response for input text"""
@@ -100,40 +179,9 @@ async def predict(q: str):

    input_data = TextInput(text=q)

-     # Truncate text if too long
-     text = input_data.text[:input_data.max_length]
-     if len(text) == input_data.max_length:
-         logger.warning("Warning: text truncated")
-     if len(text) < input_data.min_length:
-         logger.warning("Warning: empty text, aborting")
-         return None
-
-     # Prepare the model input
-     messages_think = [
-         {"role": "user", "content": text}
-     ]
-     text = tokenizer.apply_chat_template(
-         messages_think,
-         tokenize=False,
-         add_generation_prompt=True,
-         top_p=0.9,
-         temperature=0.8,
-     )
-     model_inputs = tokenizer(
-         [text],
-         return_tensors="pt",
-         add_special_tokens=False
-     ).to(model.device)
-
-     # Generate the output
-     generated_ids = model.generate(
-         **model_inputs,
-         max_new_tokens=512
-     )
+     text = fit_to_length(input_data.text, input_data.max_length)

-     # Get and decode the output
-     output_ids = generated_ids[0][len(model_inputs.input_ids[0]) :]
-     result = tokenizer.decode(output_ids, skip_special_tokens=True)
+     result = get_model_reponse(text, model)

    # Checkpoint
    processing_time = time.time() - start_time
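Note on the new handler: as committed, completion() reads input_data, which is only defined inside predict(), and calls get_model_reponse(text, model) even though the helper takes a single argument (predict() uses the same two-argument call). A minimal sketch of the presumably intended wiring, assuming the prompt comes from the validated Completion payload:

@app.post("/v1/models/apertus")
async def completion(data: Completion):
    """Generate an OpenAI-style completion"""
    if model is None or tokenizer is None:
        raise HTTPException(status_code=503, detail="Model not loaded")

    # Assumption: the prompt comes from the validated request body,
    # not from input_data (which only exists inside predict()).
    text = fit_to_length(data.prompt)
    if text is None:
        raise HTTPException(status_code=400, detail="Prompt too short")

    # The helper takes only the text; model and tokenizer are module globals.
    result = get_model_reponse(text)

    return {
        "choices": [{"text": result, "_index": 0, "logprobs": None, "finish_reason": "length"}],
        "usage": {"prompt_tokens": len(text), "completion_tokens": len(result),
                  "total_tokens": len(text) + len(result)},
    }

The explicit ValidationError catch is omitted in this sketch because FastAPI rejects an invalid Completion body before the handler runs.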
 
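For a quick check of the new route, a sketch of a client call using the requests library; the base URL is an assumption (point it at wherever the FastAPI app is served), and the field names follow the Completion model and response dict above:

import requests

payload = {
    "model": "apertus",                      # required by the Completion schema
    "prompt": "Say hello in three languages.",
    "max_tokens": 512,                       # optional; schema default is 65536
}

# Base URL is an assumption; adjust to the running app's host and port.
resp = requests.post("http://localhost:8000/v1/models/apertus", json=payload)
resp.raise_for_status()
body = resp.json()

print(body["choices"][0]["text"])   # generated completion text
print(body["usage"])                # character-based token counts, per the handler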