| """ | |
| Voice to DTMF Converter | |
| Upload audio or record voice, get DTMF tones back | |
| """ | |
| import gradio as gr | |
| import numpy as np | |
| import librosa | |
| from scipy.io import wavfile | |
| import tempfile | |
| import os | |
# Real DTMF digit frequency pairs
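# Keypad rows share a low-group frequency (697, 770, 852, or 941 Hz) and
# columns share a high-group frequency (1209, 1336, or 1477 Hz); each digit
# is the sum of its row tone and its column tone.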
DTMF_DIGITS = {
    '1': (697, 1209), '2': (697, 1336), '3': (697, 1477),
    '4': (770, 1209), '5': (770, 1336), '6': (770, 1477),
    '7': (852, 1209), '8': (852, 1336), '9': (852, 1477),
    '*': (941, 1209), '0': (941, 1336), '#': (941, 1477),
}

def map_pitch_to_dtmf_digit(pitch_hz):
    """Map voice pitch to DTMF digit (distributes across vocal range)"""
    if pitch_hz < 100:
        return None
    elif pitch_hz < 150: return '*'
    elif pitch_hz < 200: return '1'
    elif pitch_hz < 250: return '4'
    elif pitch_hz < 300: return '7'
    elif pitch_hz < 350: return '2'
    elif pitch_hz < 400: return '5'
    elif pitch_hz < 450: return '8'
    elif pitch_hz < 500: return '0'
    elif pitch_hz < 600: return '3'
    elif pitch_hz < 700: return '6'
    elif pitch_hz < 800: return '9'
    else: return '#'

def generate_dtmf_tone(digit, duration, sample_rate=44100):
    """Generate authentic dual-tone DTMF (two frequencies simultaneously)"""
    if digit not in DTMF_DIGITS:
        return np.zeros(int(sample_rate * duration))
    f1, f2 = DTMF_DIGITS[digit]
    t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
    # Generate both DTMF frequencies (this is what makes it sound like a phone!)
    tone = np.sin(2 * np.pi * f1 * t) + np.sin(2 * np.pi * f2 * t)
    tone = tone / 2  # Normalize the summed sines back into [-1, 1]
    # Sharp envelope for phone-like attack
    fade_samples = int(0.005 * sample_rate)  # 5 ms fade
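    # The short linear fade-in/out below avoids audible clicks at tone edges.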
    envelope = np.ones_like(tone)
    envelope[:fade_samples] = np.linspace(0, 1, fade_samples)
    envelope[-fade_samples:] = np.linspace(1, 0, fade_samples)
    return tone * envelope * 0.5

def voice_to_dtmf(audio_input):
    """Convert voice to DTMF tones with melodic variation"""
    if audio_input is None:
        return None, "Please provide audio input"
    try:
        # Handle tuple input from gr.Audio (sample_rate, audio_data)
        if isinstance(audio_input, tuple):
            sr, y = audio_input
            # Mix stereo down to mono for pitch tracking
            if y.ndim > 1:
                y = y.mean(axis=1)
            # Convert int16 PCM to float in [-1, 1] if needed
            if y.dtype == np.int16:
                y = y.astype(np.float32) / 32768.0
        else:
            # Load from file
            y, sr = librosa.load(audio_input, sr=44100)
        # Resample if needed
        if sr != 44100:
            y = librosa.resample(y, orig_sr=sr, target_sr=44100)
            sr = 44100
        # Get pitch track
        pitches, magnitudes = librosa.piptrack(
            y=y,
            sr=sr,
            fmin=80,
            fmax=1000,
            threshold=0.1
        )
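        # piptrack returns (frequency_bin, frame) arrays of candidate pitches
        # and their magnitudes; the hop below matches librosa's default
        # (n_fft // 4 = 512), so our frame timing lines up with those frames.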
        hop_length = 512
        frame_duration = hop_length / sr  # ~11.6 ms per frame
        # Extract pitch contour
        pitch_contour = []
        for t in range(pitches.shape[1]):
            index = magnitudes[:, t].argmax()
            pitch = pitches[index, t]
            magnitude = magnitudes[index, t]
            if pitch > 100 and magnitude > 0.3:
                pitch_contour.append(pitch)
            else:
                pitch_contour.append(0)
        # LIGHTER smoothing for more melodic variation
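        # Sliding median over voiced frames only: the median resists brief
        # outlier spikes better than a mean, and skipping zero-valued frames
        # keeps silences from dragging the estimate down.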
        window = 8  # Reduced from 20 - allows faster pitch changes
        smoothed = []
        for i in range(len(pitch_contour)):
            start = max(0, i - window // 2)
            end = min(len(pitch_contour), i + window // 2 + 1)
            window_vals = [p for p in pitch_contour[start:end] if p > 0]
            if window_vals:
                smoothed.append(sorted(window_vals)[len(window_vals) // 2])
            else:
                smoothed.append(0)
        # Map to digits
        digit_sequence = []
        for pitch in smoothed:
            if pitch > 100:
                digit_sequence.append(map_pitch_to_dtmf_digit(pitch))
            else:
                digit_sequence.append(None)  # Silence
        # GROUP with PITCH CHANGE DETECTION for melody
        MIN_TONE_FRAMES = 8  # Reduced from 20 - allows shorter notes (~93 ms)
        MIN_SILENCE_FRAMES = 3  # Reduced from 5 - tighter gaps
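        # Merge consecutive frames that map to the same digit into regions;
        # regions shorter than the minimums above are dropped as pitch jitter.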
        tone_regions = []
        current_digit = None
        region_start = 0
        region_length = 0
        for i, digit in enumerate(digit_sequence):
            # Check if digit changed (melodic variation)
            digit_changed = (digit != current_digit)
            if not digit_changed:
                # Continue current region
                region_length += 1
            else:
                # Digit changed - potential new note
                # Save previous region if long enough
                if current_digit and region_length >= MIN_TONE_FRAMES:
                    tone_regions.append((current_digit, region_start, region_start + region_length))
                elif current_digit is None and region_length >= MIN_SILENCE_FRAMES:
                    # Silence gap
                    tone_regions.append((None, region_start, region_start + region_length))
                # Start new region
                current_digit = digit
                region_start = i
                region_length = 1
        # Save final region
        if current_digit and region_length >= MIN_TONE_FRAMES:
            tone_regions.append((current_digit, region_start, region_start + region_length))
        elif current_digit is None and region_length >= MIN_SILENCE_FRAMES:
            tone_regions.append((None, region_start, region_start + region_length))
        # BUILD OUTPUT
        total_frames = len(digit_sequence)
        total_duration = total_frames * frame_duration
        dtmf_audio = np.zeros(int(total_duration * sr))
        sound_regions = 0
        silence_regions = 0
        unique_notes = set()
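        # Write each region's tone at its original time offset so the output
        # keeps the rhythm of the input speech.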
        for digit, start_frame, end_frame in tone_regions:
            if digit:  # Sound region
                sound_regions += 1
                unique_notes.add(digit)
                duration = (end_frame - start_frame) * frame_duration
                start_time = start_frame * frame_duration
                # Generate tone for this region
                tone = generate_dtmf_tone(digit, duration, sr)
                start_sample = int(start_time * sr)
                end_sample = start_sample + len(tone)
                if end_sample <= len(dtmf_audio):
                    dtmf_audio[start_sample:end_sample] = tone
            else:  # Silence region
                silence_regions += 1
        # Normalize the peak to 70% of full scale to leave some headroom
        if np.max(np.abs(dtmf_audio)) > 0:
            dtmf_audio = dtmf_audio / np.max(np.abs(dtmf_audio)) * 0.7
        # Save
        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp:
            wavfile.write(tmp.name, sr, (dtmf_audio * 32767).astype(np.int16))
            output_path = tmp.name
| info = f"β Converted {len(y)/sr:.2f}s of audio\n" | |
| info += f"β {sound_regions} note regions ({len(unique_notes)} unique tones)\n" | |
| info += f"β {silence_regions} silence gaps\n" | |
| info += f"β Notes used: {', '.join(sorted(unique_notes))}" | |
| return output_path, info | |
| except Exception as e: | |
| return None, f"Error: {str(e)}" | |

# Create Gradio interface
with gr.Blocks(title="Voice to DTMF Converter") as demo:
    gr.Markdown("""
    # 📞 Voice to DTMF Converter
    """)
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                label="Input: Your Voice",
                sources=["microphone", "upload"],
                type="numpy"
            )
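            # type="numpy" hands the callback a (sample_rate, ndarray) tuple,
            # which voice_to_dtmf handles above.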
            convert_btn = gr.Button("📞 Convert to DTMF", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(label="Output: DTMF Tones")
            info_output = gr.Textbox(label="Info", lines=4)
    convert_btn.click(
        fn=voice_to_dtmf,
        inputs=audio_input,
        outputs=[audio_output, info_output]
    )
| gr.Markdown(""" | |
| --- | |
| """) | |

if __name__ == "__main__":
    demo.launch()