# Page-header residue captured together with this file (not part of the program):
# Emeritus-21's picture / Update app.py / ee164c1 verified
import base64
import json
import logging
import os
import re
import shutil
import time
import uuid
from io import BytesIO
from pathlib import Path

import google.generativeai as genai
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
from fastapi.concurrency import run_in_threadpool
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from gtts import gTTS
# --- SETUP ---
# The Gemini credential is read from the environment; the module refuses to
# load without it, so a misconfigured deployment fails at start-up.
API_KEY = os.getenv("GEMINI_API_KEY")
if not API_KEY:
    raise RuntimeError("GEMINI_API_KEY not set in Secrets")

genai.configure(api_key=API_KEY)
# Application object that the route decorators below attach to.
app = FastAPI(
    title="AgricFusion API v3 - Gemini 3",
    description="Voice-to-voice agric advisor powered by Gemini 3 Flash",
    version="3.0",
)

# CORS is fully open: any origin, method and header is accepted.
# NOTE(review): wildcard origins are combined with allow_credentials=True;
# FastAPI's CORS documentation asks for explicit origins when credentials are
# allowed -- confirm this is intended before tightening or relying on it.
_CORS_SETTINGS = {
    "allow_origins": ["*"],
    "allow_credentials": True,
    "allow_methods": ["*"],
    "allow_headers": ["*"],
}
app.add_middleware(CORSMiddleware, **_CORS_SETTINGS)
# Scratch directory where uploaded audio is staged before being sent to Gemini.
UPLOAD_DIR = Path("/tmp") / "agric_voice"
UPLOAD_DIR.mkdir(exist_ok=True)
# gTTS voice to use for each language code the model may report.
# Yoruba, Hausa and Igbo are routed to the English voice (gTTS support for
# them is noted as limited); Swahili keeps its own voice.
_ENGLISH_VOICE_CODES = ("en", "yo", "ha", "ig")
GTTS_LANG_MAP = {**dict.fromkeys(_ENGLISH_VOICE_CODES, "en"), "sw": "sw"}
# Instruction text sent to the model together with the uploaded audio.
# It asks for language detection, a transcription and advice, returned as a
# JSON object with the keys "transcription", "advice" and "lang_code"; the
# endpoint below reads exactly those keys from the parsed reply.
SYSTEM_PROMPT = """
You are AgricFusion AI, an expert in tropical agriculture.
Task:
1. Detect spoken language (English, Yoruba, Hausa, Igbo, Swahili).
2. Transcribe exactly.
3. Give practical, organic advice.
4. Respond ONLY in the detected language.
Return STRICT JSON:
{
"transcription": "...",
"advice": "...",
"lang_code": "en|yo|ha|ig|sw"
}
"""
def clean_json_response(text):
    """Strip Markdown code fences and return the JSON payload inside *text*.

    Args:
        text: Raw model output. May be wrapped in ``` or ```json fences (any
            letter case) and may carry prose before or after the JSON.

    Returns:
        The cleaned string. When the fence-stripped text is not already a
        bare JSON object/array, the span from the first ``{`` to the last
        ``}`` is returned instead. Empty or ``None`` input yields ``""``.
    """
    if not text:
        return ""

    # Same fence pattern as before, but tolerant of "```JSON" / "```Json".
    cleaned = re.sub(r"```json\s*|```", "", text, flags=re.IGNORECASE).strip()

    # Already a bare JSON document: leave it untouched.
    if cleaned.startswith(("{", "[")) and cleaned.endswith(("}", "]")):
        return cleaned

    # Otherwise drop any chatter surrounding the outermost JSON object.
    start, end = cleaned.find("{"), cleaned.rfind("}")
    if 0 <= start < end:
        return cleaned[start:end + 1]
    return cleaned
def wait_for_gemini_file(file_obj, max_wait=60, poll_interval=2):
    """Poll an uploaded Gemini file until it leaves the ``PROCESSING`` state.

    Args:
        file_obj: Handle returned by ``genai.upload_file``; it must expose
            ``name`` and ``state.name``.
        max_wait: Upper bound, in seconds, on the total time spent waiting.
        poll_interval: Seconds to sleep between two status checks.

    Returns:
        The most recently fetched file object, whose state is ``ACTIVE``.

    Raises:
        RuntimeError: If the file is not ``ACTIVE`` once polling stops
            (processing failed, or it was still processing at the deadline).
    """
    # A wall-clock deadline also accounts for time spent inside get_file(),
    # so the total wait cannot drift far beyond max_wait.
    deadline = time.monotonic() + max_wait
    while file_obj.state.name == "PROCESSING" and time.monotonic() < deadline:
        time.sleep(poll_interval)
        file_obj = genai.get_file(file_obj.name)

    state = file_obj.state.name
    if state != "ACTIVE":
        raise RuntimeError(f"Gemini audio processing failed (state: {state})")
    # Hand back the refreshed handle so callers do not keep a stale one.
    return file_obj
@app.get("/")
def health_check():
    """Report that the service is reachable and which model name it uses."""
    status_payload = {"status": "online", "model": "gemini-3-flash-preview"}
    return status_payload
def _cleanup_voice_artifacts(local_path, gemini_file):
    """Best-effort removal of the staged upload and its remote Gemini copy."""
    log = logging.getLogger(__name__)
    try:
        if local_path.exists():
            os.remove(local_path)
    except OSError:
        log.warning("Could not remove temporary upload %s", local_path, exc_info=True)
    if gemini_file:
        try:
            genai.delete_file(gemini_file.name)
        except Exception:
            # A cleanup failure must never replace the response already built.
            log.warning("Could not delete Gemini file %s", gemini_file.name, exc_info=True)


def _run_voice_pipeline(audio, language):
    """Blocking body of the /agric-voice request (runs in a worker thread).

    Stages the upload on disk, sends it to Gemini, parses the JSON reply and
    synthesises the advice with gTTS. Returns the success payload as a dict,
    or a ``JSONResponse`` with status 500 describing the failure.
    """
    file_id = str(uuid.uuid4())
    # Uploads named without an extension (e.g. "blob") get the same default
    # suffix as uploads with no filename at all.
    suffix = Path(audio.filename or "").suffix or ".webm"
    local_path = UPLOAD_DIR / f"{file_id}{suffix}"
    gemini_file = None
    try:
        # 1. Save local file
        with local_path.open("wb") as buffer:
            shutil.copyfileobj(audio.file, buffer)

        # 2. Upload to Gemini and wait until the file is usable.
        gemini_file = genai.upload_file(path=str(local_path))
        # Keep the uploaded handle if the helper returns nothing.
        gemini_file = wait_for_gemini_file(gemini_file) or gemini_file

        # 3. Initialize Gemini 3 Flash (Fastest for Voice)
        model = genai.GenerativeModel(
            model_name="gemini-3-flash-preview",
            generation_config={
                "response_mime_type": "application/json",
                "temperature": 1.0,
            },
        )
        lang_hint = f" Context: Farmer mentioned {language}." if language else ""
        response = model.generate_content([gemini_file, SYSTEM_PROMPT + lang_hint])
        raw_text = response.text

        # 4. Robust JSON parsing: the reply must decode to a JSON object.
        try:
            res_data = json.loads(clean_json_response(raw_text))
        except (TypeError, ValueError):
            res_data = None
        if not isinstance(res_data, dict):
            return JSONResponse(
                status_code=500,
                content={"success": False, "error": "JSON Parse Error", "raw": raw_text},
            )

        # Missing, null or blank fields fall back to the defaults instead of
        # crashing on .lower() or making gTTS reject empty text.
        advice = res_data.get("advice")
        if not isinstance(advice, str) or not advice.strip():
            advice = "No advice generated."
        lang_code = res_data.get("lang_code")
        if isinstance(lang_code, str) and lang_code.strip():
            lang_code = lang_code.strip().lower()
        else:
            lang_code = "en"

        # 5. TTS Generation
        tts_lang = GTTS_LANG_MAP.get(lang_code, "en")
        tts = gTTS(text=advice, lang=tts_lang, slow=False)
        audio_io = BytesIO()
        tts.write_to_fp(audio_io)
        audio_base64 = base64.b64encode(audio_io.getvalue()).decode("utf-8")

        return {
            "success": True,
            "transcription": res_data.get("transcription"),
            "advice": advice,
            "detected_lang": lang_code,
            "audio_base64": audio_base64,
            "mime": "audio/mpeg",
        }
    except Exception as e:
        # Endpoint boundary: report any failure as a JSON 500.
        return JSONResponse(status_code=500, content={"success": False, "error": str(e)})
    finally:
        _cleanup_voice_artifacts(local_path, gemini_file)


@app.post("/agric-voice")
async def process_voice_to_voice(
    audio: UploadFile = File(...),
    language: str = Form(None)
):
    """Turn a spoken farming question into spoken advice.

    Args:
        audio: Uploaded recording of the question.
        language: Optional language hint appended to the model prompt.

    Returns:
        On success, a dict with ``success``, ``transcription``, ``advice``,
        ``detected_lang``, ``audio_base64`` (MP3 bytes, base64-encoded) and
        ``mime``. On failure, a ``JSONResponse`` with status 500.
    """
    # Every step (file copy, Gemini upload/polling/generation, gTTS) is
    # blocking, so the work is pushed to the thread pool to keep the event
    # loop free for other requests.
    return await run_in_threadpool(_run_voice_pipeline, audio, language)