""" Voice to DTMF Converter Upload audio or record voice, get DTMF tones back """ import gradio as gr import numpy as np import librosa from scipy.io import wavfile import tempfile import os # Real DTMF digit frequency pairs DTMF_DIGITS = { '1': (697, 1209), '2': (697, 1336), '3': (697, 1477), '4': (770, 1209), '5': (770, 1336), '6': (770, 1477), '7': (852, 1209), '8': (852, 1336), '9': (852, 1477), '*': (941, 1209), '0': (941, 1336), '#': (941, 1477), } def map_pitch_to_dtmf_digit(pitch_hz): """Map voice pitch to DTMF digit (distributes across vocal range)""" if pitch_hz < 100: return None elif pitch_hz < 150: return '*' elif pitch_hz < 200: return '1' elif pitch_hz < 250: return '4' elif pitch_hz < 300: return '7' elif pitch_hz < 350: return '2' elif pitch_hz < 400: return '5' elif pitch_hz < 450: return '8' elif pitch_hz < 500: return '0' elif pitch_hz < 600: return '3' elif pitch_hz < 700: return '6' elif pitch_hz < 800: return '9' else: return '#' def generate_dtmf_tone(digit, duration, sample_rate=44100): """Generate authentic dual-tone DTMF (two frequencies simultaneously)""" if digit not in DTMF_DIGITS: return np.zeros(int(sample_rate * duration)) f1, f2 = DTMF_DIGITS[digit] t = np.linspace(0, duration, int(sample_rate * duration)) # Generate both DTMF frequencies (this is what makes it sound like a phone!) tone = np.sin(2 * np.pi * f1 * t) + np.sin(2 * np.pi * f2 * t) tone = tone / 2 # Normalize # Sharp envelope for phone-like attack fade_samples = int(0.005 * sample_rate) # 5ms fade envelope = np.ones_like(tone) envelope[:fade_samples] = np.linspace(0, 1, fade_samples) envelope[-fade_samples:] = np.linspace(1, 0, fade_samples) return tone * envelope * 0.5 def voice_to_dtmf(audio_input): """Convert voice to DTMF tones with melodic variation""" if audio_input is None: return None, "Please provide audio input" try: # Handle tuple input from gr.Audio (sample_rate, audio_data) if isinstance(audio_input, tuple): sr, y = audio_input # Convert to float if needed if y.dtype == np.int16: y = y.astype(np.float32) / 32768.0 else: # Load from file y, sr = librosa.load(audio_input, sr=44100) # Resample if needed if sr != 44100: y = librosa.resample(y, orig_sr=sr, target_sr=44100) sr = 44100 # Get pitch track pitches, magnitudes = librosa.piptrack( y=y, sr=sr, fmin=80, fmax=1000, threshold=0.1 ) hop_length = 512 frame_duration = hop_length / sr # ~11ms per frame # Extract pitch contour pitch_contour = [] for t in range(pitches.shape[1]): index = magnitudes[:, t].argmax() pitch = pitches[index, t] magnitude = magnitudes[index, t] if pitch > 100 and magnitude > 0.3: pitch_contour.append(pitch) else: pitch_contour.append(0) # LIGHTER smoothing for more melodic variation window = 8 # Reduced from 20 - allows faster pitch changes smoothed = [] for i in range(len(pitch_contour)): start = max(0, i - window // 2) end = min(len(pitch_contour), i + window // 2 + 1) window_vals = [p for p in pitch_contour[start:end] if p > 0] if window_vals: smoothed.append(sorted(window_vals)[len(window_vals)//2]) else: smoothed.append(0) # Map to digits digit_sequence = [] for pitch in smoothed: if pitch > 100: digit_sequence.append(map_pitch_to_dtmf_digit(pitch)) else: digit_sequence.append(None) # Silence # GROUP with PITCH CHANGE DETECTION for melody MIN_TONE_FRAMES = 8 # Reduced from 20 - allows shorter notes (~88ms) MIN_SILENCE_FRAMES = 3 # Reduced from 5 - tighter gaps tone_regions = [] current_digit = None region_start = 0 region_length = 0 for i, digit in enumerate(digit_sequence): # Check if digit changed 
            digit_changed = (digit != current_digit)

            if not digit_changed:
                # Continue the current region
                region_length += 1
            else:
                # Digit changed - potential new note.
                # Save the previous region if it was long enough.
                if current_digit and region_length >= MIN_TONE_FRAMES:
                    tone_regions.append((current_digit, region_start, region_start + region_length))
                elif current_digit is None and region_length >= MIN_SILENCE_FRAMES:
                    # Silence gap
                    tone_regions.append((None, region_start, region_start + region_length))

                # Start a new region
                current_digit = digit
                region_start = i
                region_length = 1

        # Save the final region
        if current_digit and region_length >= MIN_TONE_FRAMES:
            tone_regions.append((current_digit, region_start, region_start + region_length))
        elif current_digit is None and region_length >= MIN_SILENCE_FRAMES:
            tone_regions.append((None, region_start, region_start + region_length))

        # Build the output signal
        total_frames = len(digit_sequence)
        total_duration = total_frames * frame_duration
        dtmf_audio = np.zeros(int(total_duration * sr))

        sound_regions = 0
        silence_regions = 0
        unique_notes = set()

        for digit, start_frame, end_frame in tone_regions:
            if digit:
                # Sound region
                sound_regions += 1
                unique_notes.add(digit)

                duration = (end_frame - start_frame) * frame_duration
                start_time = start_frame * frame_duration

                # Generate the tone for this region
                tone = generate_dtmf_tone(digit, duration, sr)
                start_sample = int(start_time * sr)
                end_sample = start_sample + len(tone)
                if end_sample <= len(dtmf_audio):
                    dtmf_audio[start_sample:end_sample] = tone
            else:
                # Silence region
                silence_regions += 1

        # Normalize
        if np.max(np.abs(dtmf_audio)) > 0:
            dtmf_audio = dtmf_audio / np.max(np.abs(dtmf_audio)) * 0.7

        # Save to a temporary WAV file
        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp:
            wavfile.write(tmp.name, sr, (dtmf_audio * 32767).astype(np.int16))
            output_path = tmp.name

        info = f"✓ Converted {len(y) / sr:.2f}s of audio\n"
        info += f"✓ {sound_regions} note regions ({len(unique_notes)} unique tones)\n"
        info += f"✓ {silence_regions} silence gaps\n"
        info += f"✓ Notes used: {', '.join(sorted(unique_notes))}"

        return output_path, info

    except Exception as e:
        return None, f"Error: {str(e)}"


# Create Gradio interface
with gr.Blocks(title="Voice to DTMF Converter") as demo:
    gr.Markdown("""
    # 📞 Voice to DTMF Converter
    """)

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                label="Input: Your Voice",
                sources=["microphone", "upload"],
                type="numpy"
            )
            convert_btn = gr.Button("🔄 Convert to DTMF", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(label="Output: DTMF Tones")
            info_output = gr.Textbox(label="Info", lines=4)

    convert_btn.click(
        fn=voice_to_dtmf,
        inputs=audio_input,
        outputs=[audio_output, info_output]
    )

    gr.Markdown("""
    ---
    """)

if __name__ == "__main__":
    demo.launch()
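

# Optional offline sketch (illustrative only; the helper name `_offline_demo`
# is an assumption and is not wired into the app above). voice_to_dtmf() also
# accepts a (sample_rate, ndarray) tuple directly, so the pipeline can be
# exercised without the Gradio UI, e.g. with a synthetic rising pitch:
def _offline_demo():
    sr = 44100
    t = np.linspace(0, 2.0, 2 * sr, endpoint=False)
    # Chirp whose instantaneous pitch rises from ~150 Hz to ~350 Hz, sweeping
    # across several of the digit bins in map_pitch_to_dtmf_digit()
    sweep = 0.5 * np.sin(2 * np.pi * (150 * t + 50 * t ** 2)).astype(np.float32)
    path, info = voice_to_dtmf((sr, sweep))
    print(info)
    print("DTMF audio written to:", path)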