"""
Voice to DTMF Converter
Upload audio or record voice, get DTMF tones back
"""
import gradio as gr
import numpy as np
import librosa
from scipy.io import wavfile
import tempfile
import os
# Standard DTMF keypad: each key is a (low Hz, high Hz) frequency pair
DTMF_DIGITS = {
'1': (697, 1209), '2': (697, 1336), '3': (697, 1477),
'4': (770, 1209), '5': (770, 1336), '6': (770, 1477),
'7': (852, 1209), '8': (852, 1336), '9': (852, 1477),
'*': (941, 1209), '0': (941, 1336), '#': (941, 1477),
}
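# Example: pressing '5' on a phone plays 770 Hz and 1336 Hz simultaneously:
# >>> DTMF_DIGITS['5']
# (770, 1336)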
def map_pitch_to_dtmf_digit(pitch_hz):
"""Map voice pitch to DTMF digit (distributes across vocal range)"""
if pitch_hz < 100:
return None
elif pitch_hz < 150: return '*'
elif pitch_hz < 200: return '1'
elif pitch_hz < 250: return '4'
elif pitch_hz < 300: return '7'
elif pitch_hz < 350: return '2'
elif pitch_hz < 400: return '5'
elif pitch_hz < 450: return '8'
elif pitch_hz < 500: return '0'
elif pitch_hz < 600: return '3'
elif pitch_hz < 700: return '6'
elif pitch_hz < 800: return '9'
else: return '#'
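# Example (sanity check of the thresholds above): a low speaking pitch near
# 120 Hz maps to '*', while one near 220 Hz maps to '4':
# >>> map_pitch_to_dtmf_digit(120)
# '*'
# >>> map_pitch_to_dtmf_digit(220)
# '4'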
def generate_dtmf_tone(digit, duration, sample_rate=44100):
"""Generate authentic dual-tone DTMF (two frequencies simultaneously)"""
if digit not in DTMF_DIGITS:
return np.zeros(int(sample_rate * duration))
f1, f2 = DTMF_DIGITS[digit]
    t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
# Generate both DTMF frequencies (this is what makes it sound like a phone!)
tone = np.sin(2 * np.pi * f1 * t) + np.sin(2 * np.pi * f2 * t)
tone = tone / 2 # Normalize
    # Sharp 5ms fade in/out for a phone-like attack; the guard keeps the
    # fade slices valid for tones shorter than two fades
    fade_samples = min(int(0.005 * sample_rate), len(tone) // 2)
    envelope = np.ones_like(tone)
    if fade_samples > 0:
        envelope[:fade_samples] = np.linspace(0, 1, fade_samples)
        envelope[-fade_samples:] = np.linspace(1, 0, fade_samples)
return tone * envelope * 0.5
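# Example: a 200 ms '1' mixes 697 Hz and 1209 Hz with 5 ms fades at each end:
# >>> tone = generate_dtmf_tone('1', 0.2)
# >>> tone.shape
# (8820,)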
def voice_to_dtmf(audio_input):
"""Convert voice to DTMF tones with melodic variation"""
if audio_input is None:
return None, "Please provide audio input"
try:
        # Handle tuple input from gr.Audio with type="numpy": (sample_rate, samples)
        if isinstance(audio_input, tuple):
            sr, y = audio_input
            y = np.asarray(y)
            # Downmix stereo (samples, channels) to mono
            if y.ndim > 1:
                y = y.mean(axis=1)
            # Convert integer PCM to float in [-1, 1]
            if np.issubdtype(y.dtype, np.integer):
                y = y.astype(np.float32) / np.iinfo(y.dtype).max
            else:
                y = y.astype(np.float32)
        else:
            # Load from file (librosa resamples to 44100 Hz and downmixes to mono)
            y, sr = librosa.load(audio_input, sr=44100)
# Resample if needed
if sr != 44100:
y = librosa.resample(y, orig_sr=sr, target_sr=44100)
sr = 44100
# Get pitch track
pitches, magnitudes = librosa.piptrack(
y=y,
sr=sr,
fmin=80,
fmax=1000,
threshold=0.1
)
        hop_length = 512  # piptrack's default hop
        frame_duration = hop_length / sr  # ~11.6ms per frame at 44.1 kHz
# Extract pitch contour
pitch_contour = []
for t in range(pitches.shape[1]):
            index = magnitudes[:, t].argmax()  # strongest pitch bin in this frame
pitch = pitches[index, t]
magnitude = magnitudes[index, t]
if pitch > 100 and magnitude > 0.3:
pitch_contour.append(pitch)
else:
pitch_contour.append(0)
        # LIGHTER median smoothing for more melodic variation
        window = 8  # frames (~93ms); reduced from 20 to follow faster pitch changes
smoothed = []
for i in range(len(pitch_contour)):
start = max(0, i - window // 2)
end = min(len(pitch_contour), i + window // 2 + 1)
            window_vals = [p for p in pitch_contour[start:end] if p > 0]
            if window_vals:
                # Median of the voiced frames in the window
                smoothed.append(sorted(window_vals)[len(window_vals) // 2])
else:
smoothed.append(0)
# Map to digits
digit_sequence = []
for pitch in smoothed:
if pitch > 100:
digit_sequence.append(map_pitch_to_dtmf_digit(pitch))
else:
digit_sequence.append(None) # Silence
# GROUP with PITCH CHANGE DETECTION for melody
        MIN_TONE_FRAMES = 8  # ~93ms minimum note (reduced from 20 - allows shorter notes)
        MIN_SILENCE_FRAMES = 3  # ~35ms minimum gap (reduced from 5 - tighter gaps)
tone_regions = []
current_digit = None
region_start = 0
region_length = 0
for i, digit in enumerate(digit_sequence):
# Check if digit changed (melodic variation)
digit_changed = (digit != current_digit)
if not digit_changed:
# Continue current region
region_length += 1
else:
# Digit changed - potential new note
# Save previous region if long enough
if current_digit and region_length >= MIN_TONE_FRAMES:
tone_regions.append((current_digit, region_start, region_start + region_length))
elif current_digit is None and region_length >= MIN_SILENCE_FRAMES:
# Silence gap
tone_regions.append((None, region_start, region_start + region_length))
# Start new region
current_digit = digit
region_start = i
region_length = 1
# Save final region
if current_digit and region_length >= MIN_TONE_FRAMES:
tone_regions.append((current_digit, region_start, region_start + region_length))
elif current_digit is None and region_length >= MIN_SILENCE_FRAMES:
tone_regions.append((None, region_start, region_start + region_length))
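        # Each region is (digit_or_None, start_frame, end_frame); e.g. ('4', 12, 30)
        # is a '4' lasting 18 frames (~0.21s) starting ~0.14s into the clip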
# BUILD OUTPUT
total_frames = len(digit_sequence)
total_duration = total_frames * frame_duration
dtmf_audio = np.zeros(int(total_duration * sr))
sound_regions = 0
silence_regions = 0
unique_notes = set()
for digit, start_frame, end_frame in tone_regions:
if digit: # Sound region
sound_regions += 1
unique_notes.add(digit)
duration = (end_frame - start_frame) * frame_duration
start_time = start_frame * frame_duration
# Generate tone for this region
tone = generate_dtmf_tone(digit, duration, sr)
                start_sample = int(start_time * sr)
                end_sample = min(start_sample + len(tone), len(dtmf_audio))
                # Truncate rather than drop a tone that overruns the buffer
                dtmf_audio[start_sample:end_sample] = tone[:end_sample - start_sample]
else: # Silence region
silence_regions += 1
        # Peak-normalize to 0.7 to leave headroom before int16 conversion
        peak = np.max(np.abs(dtmf_audio))
        if peak > 0:
            dtmf_audio = dtmf_audio / peak * 0.7
# Save
with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp:
wavfile.write(tmp.name, sr, (dtmf_audio * 32767).astype(np.int16))
output_path = tmp.name
        info = f"✓ Converted {len(y)/sr:.2f}s of audio\n"
        info += f"✓ {sound_regions} note regions ({len(unique_notes)} unique tones)\n"
        info += f"✓ {silence_regions} silence gaps\n"
        info += f"✓ Notes used: {', '.join(sorted(unique_notes)) or 'none'}"
return output_path, info
except Exception as e:
return None, f"Error: {str(e)}"
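# Example (hypothetical local smoke test, bypassing the UI): pass the same
# (sample_rate, samples) tuple that gr.Audio delivers with type="numpy".
# A steady 220 Hz hum should come back as a run of '4' tones:
# >>> sr_demo = 44100
# >>> t_demo = np.linspace(0, 1.0, sr_demo, endpoint=False)
# >>> hum = (0.5 * np.sin(2 * np.pi * 220 * t_demo)).astype(np.float32)
# >>> wav_path, report = voice_to_dtmf((sr_demo, hum))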
# Create Gradio interface
with gr.Blocks(title="Voice to DTMF Converter") as demo:
    gr.Markdown("""
    # 📞 Voice to DTMF Converter
    """)
with gr.Row():
with gr.Column():
audio_input = gr.Audio(
label="Input: Your Voice",
sources=["microphone", "upload"],
type="numpy"
)
            convert_btn = gr.Button("🔄 Convert to DTMF", variant="primary")
with gr.Column():
audio_output = gr.Audio(label="Output: DTMF Tones")
info_output = gr.Textbox(label="Info", lines=4)
convert_btn.click(
fn=voice_to_dtmf,
inputs=audio_input,
outputs=[audio_output, info_output]
)
gr.Markdown("""
---
""")
if __name__ == "__main__":
demo.launch()