from flask import Flask, request, jsonify
from flask_cors import CORS
from PIL import Image
import torch
from transformers import AutoProcessor, BlipForConditionalGeneration

app = Flask(__name__)
CORS(app, resources={r"/*": {"origins": "*"}})

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the BLIP captioning model once at startup. If loading fails, the
# endpoint below reports the model as unavailable instead of crashing.
vision_processor, vision_model = None, None
try:
    vision_processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
    vision_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)
    print("--- VISION SERVICE --- BLIP Vision model loaded successfully.")
except Exception as e:
    print(f"--- VISION SERVICE --- CRITICAL ERROR loading Vision model: {e}")

@app.route("/describe_image", methods=["POST"])
def describe_image():
    # Refuse requests if the model never loaded.
    if vision_model is None:
        return jsonify({"error": "Vision model not available."}), 500

    user_prompt = request.form.get("prompt", "")
    image_file = request.files.get("image")
    if not image_file:
        return jsonify({"error": "No image file found."}), 400

    try:
        image_obj = Image.open(image_file.stream).convert("RGB")
        # With a prompt, BLIP performs conditional captioning (the prompt is
        # echoed at the start of the generated text); without one, it
        # produces a free-form caption.
        if user_prompt:
            inputs = vision_processor(images=image_obj, text=user_prompt, return_tensors="pt").to(device)
        else:
            inputs = vision_processor(images=image_obj, return_tensors="pt").to(device)
        output = vision_model.generate(**inputs, max_new_tokens=50)
        caption = vision_processor.decode(output[0], skip_special_tokens=True).strip()
        return jsonify({"content": caption})
    except Exception as e:
        print(f"Error processing image: {e}")
        return jsonify({"error": "Sorry, I had trouble processing that image."}), 500

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=8081)  # Use a different port for local testing
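
# A minimal smoke test for the endpoint, sketched as a comment; "photo.jpg"
# and the prompt text are placeholders, and the port matches app.run() above:
#
#   curl -X POST http://localhost:8081/describe_image \
#        -F "image=@photo.jpg" \
#        -F "prompt=a photo of"
#
# A successful response has the shape {"content": "<generated caption>"}.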