""" Voice to DTMF Converter Upload audio or record voice, get DTMF tones back """ import gradio as gr import numpy as np import librosa from scipy.io import wavfile import tempfile import os # Real DTMF digit frequency pairs DTMF_DIGITS = { '1': (697, 1209), '2': (697, 1336), '3': (697, 1477), '4': (770, 1209), '5': (770, 1336), '6': (770, 1477), '7': (852, 1209), '8': (852, 1336), '9': (852, 1477), '*': (941, 1209), '0': (941, 1336), '#': (941, 1477), } def map_pitch_to_dtmf_digit(pitch_hz): """Map voice pitch to DTMF digit (distributes across vocal range)""" if pitch_hz < 100: return None elif pitch_hz < 150: return '*' elif pitch_hz < 200: return '1' elif pitch_hz < 250: return '4' elif pitch_hz < 300: return '7' elif pitch_hz < 350: return '2' elif pitch_hz < 400: return '5' elif pitch_hz < 450: return '8' elif pitch_hz < 500: return '0' elif pitch_hz < 600: return '3' elif pitch_hz < 700: return '6' elif pitch_hz < 800: return '9' else: return '#' def generate_dtmf_tone(digit, duration, sample_rate=44100): """Generate authentic dual-tone DTMF (two frequencies simultaneously)""" if digit not in DTMF_DIGITS: return np.zeros(int(sample_rate * duration)) f1, f2 = DTMF_DIGITS[digit] t = np.linspace(0, duration, int(sample_rate * duration)) # Generate both DTMF frequencies (this is what makes it sound like a phone!) tone = np.sin(2 * np.pi * f1 * t) + np.sin(2 * np.pi * f2 * t) tone = tone / 2 # Normalize # Sharp envelope for phone-like attack fade_samples = int(0.005 * sample_rate) # 5ms fade envelope = np.ones_like(tone) envelope[:fade_samples] = np.linspace(0, 1, fade_samples) envelope[-fade_samples:] = np.linspace(1, 0, fade_samples) return tone * envelope * 0.5 def voice_to_dtmf(audio_input): """Convert voice to DTMF tones with melodic variation""" if audio_input is None: return None, "Please provide audio input" try: # Handle tuple input from gr.Audio (sample_rate, audio_data) if isinstance(audio_input, tuple): sr, y = audio_input # Convert to float if needed if y.dtype == np.int16: y = y.astype(np.float32) / 32768.0 else: # Load from file y, sr = librosa.load(audio_input, sr=44100) # Resample if needed if sr != 44100: y = librosa.resample(y, orig_sr=sr, target_sr=44100) sr = 44100 # Get pitch track pitches, magnitudes = librosa.piptrack( y=y, sr=sr, fmin=80, fmax=1000, threshold=0.1 ) hop_length = 512 frame_duration = hop_length / sr # ~11ms per frame # Extract pitch contour pitch_contour = [] for t in range(pitches.shape[1]): index = magnitudes[:, t].argmax() pitch = pitches[index, t] magnitude = magnitudes[index, t] if pitch > 100 and magnitude > 0.3: pitch_contour.append(pitch) else: pitch_contour.append(0) # LIGHTER smoothing for more melodic variation window = 8 # Reduced from 20 - allows faster pitch changes smoothed = [] for i in range(len(pitch_contour)): start = max(0, i - window // 2) end = min(len(pitch_contour), i + window // 2 + 1) window_vals = [p for p in pitch_contour[start:end] if p > 0] if window_vals: smoothed.append(sorted(window_vals)[len(window_vals)//2]) else: smoothed.append(0) # Map to digits digit_sequence = [] for pitch in smoothed: if pitch > 100: digit_sequence.append(map_pitch_to_dtmf_digit(pitch)) else: digit_sequence.append(None) # Silence # GROUP with PITCH CHANGE DETECTION for melody MIN_TONE_FRAMES = 8 # Reduced from 20 - allows shorter notes (~88ms) MIN_SILENCE_FRAMES = 3 # Reduced from 5 - tighter gaps tone_regions = [] current_digit = None region_start = 0 region_length = 0 for i, digit in enumerate(digit_sequence): # Check if digit changed 
            digit_changed = (digit != current_digit)

            if not digit_changed:
                # Continue the current region
                region_length += 1
            else:
                # Digit changed - potential new note.
                # Save the previous region if it was long enough.
                if current_digit and region_length >= MIN_TONE_FRAMES:
                    tone_regions.append((current_digit, region_start, region_start + region_length))
                elif current_digit is None and region_length >= MIN_SILENCE_FRAMES:
                    # Silence gap
                    tone_regions.append((None, region_start, region_start + region_length))

                # Start a new region
                current_digit = digit
                region_start = i
                region_length = 1

        # Save the final region
        if current_digit and region_length >= MIN_TONE_FRAMES:
            tone_regions.append((current_digit, region_start, region_start + region_length))
        elif current_digit is None and region_length >= MIN_SILENCE_FRAMES:
            tone_regions.append((None, region_start, region_start + region_length))

        # Build the output signal
        total_frames = len(digit_sequence)
        total_duration = total_frames * frame_duration
        dtmf_audio = np.zeros(int(total_duration * sr))

        sound_regions = 0
        silence_regions = 0
        unique_notes = set()

        for digit, start_frame, end_frame in tone_regions:
            if digit:
                # Sound region
                sound_regions += 1
                unique_notes.add(digit)

                duration = (end_frame - start_frame) * frame_duration
                start_time = start_frame * frame_duration

                # Generate the tone for this region
                tone = generate_dtmf_tone(digit, duration, sr)
                start_sample = int(start_time * sr)
                end_sample = start_sample + len(tone)
                if end_sample <= len(dtmf_audio):
                    dtmf_audio[start_sample:end_sample] = tone
            else:
                # Silence region
                silence_regions += 1

        # Normalize
        if np.max(np.abs(dtmf_audio)) > 0:
            dtmf_audio = dtmf_audio / np.max(np.abs(dtmf_audio)) * 0.7

        # Save to a temporary WAV file
        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp:
            wavfile.write(tmp.name, sr, (dtmf_audio * 32767).astype(np.int16))
            output_path = tmp.name

        info = f"✓ Converted {len(y) / sr:.2f}s of audio\n"
        info += f"✓ {sound_regions} note regions ({len(unique_notes)} unique tones)\n"
        info += f"✓ {silence_regions} silence gaps\n"
        info += f"✓ Notes used: {', '.join(sorted(unique_notes))}"

        return output_path, info

    except Exception as e:
        return None, f"Error: {str(e)}"


# Create Gradio interface
with gr.Blocks(title="Voice to DTMF Converter") as demo:
    gr.Markdown("""
    # 📞 Voice to DTMF Converter
    """)

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                label="Input: Your Voice",
                sources=["microphone", "upload"],
                type="numpy"
            )
            convert_btn = gr.Button("🔄 Convert to DTMF", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(label="Output: DTMF Tones")
            info_output = gr.Textbox(label="Info", lines=4)

    convert_btn.click(
        fn=voice_to_dtmf,
        inputs=audio_input,
        outputs=[audio_output, info_output]
    )

    gr.Markdown("""
    ---
    """)

if __name__ == "__main__":
    demo.launch()
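

# Optional offline sketch (illustrative only; the helper name `_offline_demo`
# is an assumption and is not wired into the app above). voice_to_dtmf() also
# accepts a (sample_rate, ndarray) tuple directly, so the pipeline can be
# exercised without the Gradio UI, e.g. with a synthetic rising pitch:
def _offline_demo():
    sr = 44100
    t = np.linspace(0, 2.0, 2 * sr, endpoint=False)
    # Chirp whose instantaneous pitch rises from ~150 Hz to ~350 Hz, sweeping
    # across several of the digit bins in map_pitch_to_dtmf_digit()
    sweep = 0.5 * np.sin(2 * np.pi * (150 * t + 50 * t ** 2)).astype(np.float32)
    path, info = voice_to_dtmf((sr, sweep))
    print(info)
    print("DTMF audio written to:", path)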