"""
Voice to DTMF Converter
Upload audio or record voice, get DTMF tones back
"""
import gradio as gr
import numpy as np
import librosa
from scipy.io import wavfile
import tempfile
import os
# Standard DTMF keypad: each key is a (low Hz, high Hz) frequency pair
DTMF_DIGITS = {
'1': (697, 1209), '2': (697, 1336), '3': (697, 1477),
'4': (770, 1209), '5': (770, 1336), '6': (770, 1477),
'7': (852, 1209), '8': (852, 1336), '9': (852, 1477),
'*': (941, 1209), '0': (941, 1336), '#': (941, 1477),
}
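# Example: pressing '5' on a phone plays 770 Hz and 1336 Hz simultaneously:
# >>> DTMF_DIGITS['5']
# (770, 1336)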
def map_pitch_to_dtmf_digit(pitch_hz):
"""Map voice pitch to DTMF digit (distributes across vocal range)"""
if pitch_hz < 100:
return None
elif pitch_hz < 150: return '*'
elif pitch_hz < 200: return '1'
elif pitch_hz < 250: return '4'
elif pitch_hz < 300: return '7'
elif pitch_hz < 350: return '2'
elif pitch_hz < 400: return '5'
elif pitch_hz < 450: return '8'
elif pitch_hz < 500: return '0'
elif pitch_hz < 600: return '3'
elif pitch_hz < 700: return '6'
elif pitch_hz < 800: return '9'
else: return '#'
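# Example (sanity check of the thresholds above): a low speaking pitch near
# 120 Hz maps to '*', while one near 220 Hz maps to '4':
# >>> map_pitch_to_dtmf_digit(120)
# '*'
# >>> map_pitch_to_dtmf_digit(220)
# '4'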
def generate_dtmf_tone(digit, duration, sample_rate=44100):
"""Generate authentic dual-tone DTMF (two frequencies simultaneously)"""
if digit not in DTMF_DIGITS:
return np.zeros(int(sample_rate * duration))
f1, f2 = DTMF_DIGITS[digit]
    t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
# Generate both DTMF frequencies (this is what makes it sound like a phone!)
tone = np.sin(2 * np.pi * f1 * t) + np.sin(2 * np.pi * f2 * t)
tone = tone / 2 # Normalize
    # Sharp 5ms fade in/out for a phone-like attack; the guard keeps the
    # fade slices valid for tones shorter than two fades
    fade_samples = min(int(0.005 * sample_rate), len(tone) // 2)
    envelope = np.ones_like(tone)
    if fade_samples > 0:
        envelope[:fade_samples] = np.linspace(0, 1, fade_samples)
        envelope[-fade_samples:] = np.linspace(1, 0, fade_samples)
return tone * envelope * 0.5
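# Example: a 200 ms '1' mixes 697 Hz and 1209 Hz with 5 ms fades at each end:
# >>> tone = generate_dtmf_tone('1', 0.2)
# >>> tone.shape
# (8820,)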
def voice_to_dtmf(audio_input):
"""Convert voice to DTMF tones with melodic variation"""
if audio_input is None:
return None, "Please provide audio input"
try:
        # Handle tuple input from gr.Audio with type="numpy": (sample_rate, samples)
        if isinstance(audio_input, tuple):
            sr, y = audio_input
            y = np.asarray(y)
            # Downmix stereo (samples, channels) to mono
            if y.ndim > 1:
                y = y.mean(axis=1)
            # Convert integer PCM to float in [-1, 1]
            if np.issubdtype(y.dtype, np.integer):
                y = y.astype(np.float32) / np.iinfo(y.dtype).max
            else:
                y = y.astype(np.float32)
        else:
            # Load from file (librosa resamples to 44100 Hz and downmixes to mono)
            y, sr = librosa.load(audio_input, sr=44100)
# Resample if needed
if sr != 44100:
y = librosa.resample(y, orig_sr=sr, target_sr=44100)
sr = 44100
# Get pitch track
pitches, magnitudes = librosa.piptrack(
y=y,
sr=sr,
fmin=80,
fmax=1000,
threshold=0.1
)
        hop_length = 512  # piptrack's default hop
        frame_duration = hop_length / sr  # ~11.6ms per frame at 44.1 kHz
# Extract pitch contour
pitch_contour = []
for t in range(pitches.shape[1]):
            index = magnitudes[:, t].argmax()  # strongest pitch bin in this frame
pitch = pitches[index, t]
magnitude = magnitudes[index, t]
if pitch > 100 and magnitude > 0.3:
pitch_contour.append(pitch)
else:
pitch_contour.append(0)
        # LIGHTER median smoothing for more melodic variation
        window = 8  # frames (~93ms); reduced from 20 to follow faster pitch changes
smoothed = []
for i in range(len(pitch_contour)):
start = max(0, i - window // 2)
end = min(len(pitch_contour), i + window // 2 + 1)
            window_vals = [p for p in pitch_contour[start:end] if p > 0]
            if window_vals:
                # Median of the voiced frames in the window
                smoothed.append(sorted(window_vals)[len(window_vals) // 2])
else:
smoothed.append(0)
# Map to digits
digit_sequence = []
for pitch in smoothed:
if pitch > 100:
digit_sequence.append(map_pitch_to_dtmf_digit(pitch))
else:
digit_sequence.append(None) # Silence
# GROUP with PITCH CHANGE DETECTION for melody
        MIN_TONE_FRAMES = 8  # ~93ms minimum note (reduced from 20 - allows shorter notes)
        MIN_SILENCE_FRAMES = 3  # ~35ms minimum gap (reduced from 5 - tighter gaps)
tone_regions = []
current_digit = None
region_start = 0
region_length = 0
for i, digit in enumerate(digit_sequence):
# Check if digit changed (melodic variation)
digit_changed = (digit != current_digit)
if not digit_changed:
# Continue current region
region_length += 1
else:
# Digit changed - potential new note
# Save previous region if long enough
if current_digit and region_length >= MIN_TONE_FRAMES:
tone_regions.append((current_digit, region_start, region_start + region_length))
elif current_digit is None and region_length >= MIN_SILENCE_FRAMES:
# Silence gap
tone_regions.append((None, region_start, region_start + region_length))
# Start new region
current_digit = digit
region_start = i
region_length = 1
# Save final region
if current_digit and region_length >= MIN_TONE_FRAMES:
tone_regions.append((current_digit, region_start, region_start + region_length))
elif current_digit is None and region_length >= MIN_SILENCE_FRAMES:
tone_regions.append((None, region_start, region_start + region_length))
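        # Each region is (digit_or_None, start_frame, end_frame); e.g. ('4', 12, 30)
        # is a '4' lasting 18 frames (~0.21s) starting ~0.14s into the clip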
# BUILD OUTPUT
total_frames = len(digit_sequence)
total_duration = total_frames * frame_duration
dtmf_audio = np.zeros(int(total_duration * sr))
sound_regions = 0
silence_regions = 0
unique_notes = set()
for digit, start_frame, end_frame in tone_regions:
if digit: # Sound region
sound_regions += 1
unique_notes.add(digit)
duration = (end_frame - start_frame) * frame_duration
start_time = start_frame * frame_duration
# Generate tone for this region
tone = generate_dtmf_tone(digit, duration, sr)
                start_sample = int(start_time * sr)
                end_sample = min(start_sample + len(tone), len(dtmf_audio))
                # Truncate rather than drop a tone that overruns the buffer
                dtmf_audio[start_sample:end_sample] = tone[:end_sample - start_sample]
else: # Silence region
silence_regions += 1
        # Peak-normalize to 0.7 to leave headroom before int16 conversion
        peak = np.max(np.abs(dtmf_audio))
        if peak > 0:
            dtmf_audio = dtmf_audio / peak * 0.7
# Save
with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp:
wavfile.write(tmp.name, sr, (dtmf_audio * 32767).astype(np.int16))
output_path = tmp.name
        info = f"✓ Converted {len(y)/sr:.2f}s of audio\n"
        info += f"✓ {sound_regions} note regions ({len(unique_notes)} unique tones)\n"
        info += f"✓ {silence_regions} silence gaps\n"
        info += f"✓ Notes used: {', '.join(sorted(unique_notes)) or 'none'}"
return output_path, info
except Exception as e:
return None, f"Error: {str(e)}"
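# Example (hypothetical local smoke test, bypassing the UI): pass the same
# (sample_rate, samples) tuple that gr.Audio delivers with type="numpy".
# A steady 220 Hz hum should come back as a run of '4' tones:
# >>> sr_demo = 44100
# >>> t_demo = np.linspace(0, 1.0, sr_demo, endpoint=False)
# >>> hum = (0.5 * np.sin(2 * np.pi * 220 * t_demo)).astype(np.float32)
# >>> wav_path, report = voice_to_dtmf((sr_demo, hum))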
# Create Gradio interface
with gr.Blocks(title="Voice to DTMF Converter") as demo:
    gr.Markdown("""
    # 📞 Voice to DTMF Converter
    """)
with gr.Row():
with gr.Column():
audio_input = gr.Audio(
label="Input: Your Voice",
sources=["microphone", "upload"],
type="numpy"
)
            convert_btn = gr.Button("🔄 Convert to DTMF", variant="primary")
with gr.Column():
audio_output = gr.Audio(label="Output: DTMF Tones")
info_output = gr.Textbox(label="Info", lines=4)
convert_btn.click(
fn=voice_to_dtmf,
inputs=audio_input,
outputs=[audio_output, info_output]
)
gr.Markdown("""
---
""")
if __name__ == "__main__":
demo.launch()