import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Load Falcon-7B-Instruct with 4-bit quantization
# (requires the bitsandbytes package and a CUDA-capable GPU)
model_id = "tiiuae/falcon-7b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
    ),
    trust_remote_code=True,
)

# Generate a response for a given prompt
def generate_response(prompt):
    # Place inputs on the same device as the model instead of assuming CUDA
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,  # cap generated tokens; max_length would count the prompt too
        do_sample=True,
        top_k=50,
        top_p=0.95,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,  # Falcon's tokenizer defines no pad token
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Gradio interface
iface = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(label="Enter your prompt", placeholder="Type here..."),
    outputs=gr.Textbox(label="Zephyrix Response"),
    title="Zephyrix - Pakistan's First AI Model",
    description="Built with love in Pakistan!",
)

# Launch the app, listening on all interfaces
iface.launch(server_name="0.0.0.0", server_port=7860)
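
# Usage sketch (an assumption, not part of the original script): once the app is
# running, it can also be queried programmatically with the official gradio_client
# package. The URL matches the launch settings above, and "/predict" is the default
# api_name that gr.Interface exposes. Kept as comments because iface.launch() above
# blocks, so code placed after it would not run while the server is up.
#
#   from gradio_client import Client
#
#   client = Client("http://127.0.0.1:7860/")
#   result = client.predict("What is the capital of Pakistan?", api_name="/predict")
#   print(result)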