curl --request POST \
--url https://{api_endpoint}/api/v1/voice_configurations \
--header 'Authorization: Bearer <token>' \
--header 'Content-Type: application/json' \
--data '
{
"name": "<string>",
"speech_to_text": {
"provider": "<string>",
"watson_stt_config": {
"api_url": "<string>",
"model": "<string>",
"api_key": "<string>",
"bearer_token": "<string>",
"end_of_phrase_silence_time": 123,
"background_audio_suppression": 0.5,
"language_customization_id": "<string>",
"inactivity_timeout": 0,
"profanity_filter": true,
"smart_formatting": true,
"speaker_labels": true,
"redaction": true,
"low_latency": true,
"learning_opt_out": true,
"watson_metadata": "<string>",
"smart_formatting_version": 1,
"customization_weight": 0.5,
"character_insertion_bias": 0
},
"emotech_stt_config": {
"api_url": "<string>",
"api_key": "<string>",
"positive_speech_threshold": 0.25,
"negative_speech_threshold": 0.25,
"partial_interval": 500,
"silence_threshold": 500
}
},
"text_to_speech": {
"provider": "<string>",
"watson_tts_config": {
"api_url": "<string>",
"voice": "<string>",
"api_key": "<string>",
"bearer_token": "<string>",
"rate_percentage": 0,
"pitch_percentage": 0,
"language": "<string>",
"customization_id": "<string>",
"meta_id": "<string>",
"learning_opt_out": true
},
"emotech_tts_config": {
"api_url": "<string>",
"api_key": "<string>",
"voice": "<string>"
},
"elevenlabs_tts_config": {
"model_id": "<string>",
"voice_id": "<string>",
"api_key": "<string>",
"apply_text_normalization": "<string>",
"language_code": "<string>",
"optimize_streaming_latency": 123,
"apply_language_text_normalization": true,
"pronunciation_dictionary_locators": [
{
"pronunciation_dictionary_id": "<string>",
"version_id": "<string>"
}
],
"seed": 123,
"previous_text": "<string>",
"next_text": "<string>",
"voice_settings": {
"speed": 1,
"style": 0,
"stability": 0.5,
"similarity_boost": 0.75,
"use_speaker_boost": true
}
}
},
"llm_aggregation_timeout_seconds": 0.8,
"language": "en-us",
"additional_languages": {},
"dtmf_input": {
"inter_digit_timeout_ms": 2500,
"termination_key": "<string>",
"maximum_count": 123,
"ignore_speech": true
},
"vad": {
"enabled": true,
"provider": "silero_vad",
"silero_vad_config": {
"confidence": 0.7,
"start_seconds": 0.2,
"stop_seconds": 0.8,
"min_volume": 0.6
}
}
}
'
{
"detail": [
{
"loc": [
"<string>"
],
"msg": "<string>",
"type": "<string>"
}
]
}
curl --request POST \
--url https://{api_endpoint}/api/v1/voice_configurations \
--header 'Authorization: Bearer <token>' \
--header 'Content-Type: application/json' \
--data '
{
"name": "<string>",
"speech_to_text": {
"provider": "<string>",
"watson_stt_config": {
"api_url": "<string>",
"model": "<string>",
"api_key": "<string>",
"bearer_token": "<string>",
"end_of_phrase_silence_time": 123,
"background_audio_suppression": 0.5,
"language_customization_id": "<string>",
"inactivity_timeout": 0,
"profanity_filter": true,
"smart_formatting": true,
"speaker_labels": true,
"redaction": true,
"low_latency": true,
"learning_opt_out": true,
"watson_metadata": "<string>",
"smart_formatting_version": 1,
"customization_weight": 0.5,
"character_insertion_bias": 0
},
"emotech_stt_config": {
"api_url": "<string>",
"api_key": "<string>",
"positive_speech_threshold": 0.25,
"negative_speech_threshold": 0.25,
"partial_interval": 500,
"silence_threshold": 500
}
},
"text_to_speech": {
"provider": "<string>",
"watson_tts_config": {
"api_url": "<string>",
"voice": "<string>",
"api_key": "<string>",
"bearer_token": "<string>",
"rate_percentage": 0,
"pitch_percentage": 0,
"language": "<string>",
"customization_id": "<string>",
"meta_id": "<string>",
"learning_opt_out": true
},
"emotech_tts_config": {
"api_url": "<string>",
"api_key": "<string>",
"voice": "<string>"
},
"elevenlabs_tts_config": {
"model_id": "<string>",
"voice_id": "<string>",
"api_key": "<string>",
"apply_text_normalization": "<string>",
"language_code": "<string>",
"optimize_streaming_latency": 123,
"apply_language_text_normalization": true,
"pronunciation_dictionary_locators": [
{
"pronunciation_dictionary_id": "<string>",
"version_id": "<string>"
}
],
"seed": 123,
"previous_text": "<string>",
"next_text": "<string>",
"voice_settings": {
"speed": 1,
"style": 0,
"stability": 0.5,
"similarity_boost": 0.75,
"use_speaker_boost": true
}
}
},
"llm_aggregation_timeout_seconds": 0.8,
"language": "en-us",
"additional_languages": {},
"dtmf_input": {
"inter_digit_timeout_ms": 2500,
"termination_key": "<string>",
"maximum_count": 123,
"ignore_speech": true
},
"vad": {
"enabled": true,
"provider": "silero_vad",
"silero_vad_config": {
"confidence": 0.7,
"start_seconds": 0.2,
"stop_seconds": 0.8,
"min_volume": 0.6
}
}
}
'
{
"detail": [
{
"loc": [
"<string>"
],
"msg": "<string>",
"type": "<string>"
}
]
}
Bearer authentication header of the form Bearer <token>, where <token> is your auth token.
1 - 128
Show child attributes
1 - 128
Show child attributes
1 - 2048
1 - 256
1 - 2048
1 - 2048
Background audio suppression level (0.0 to 1.0). Default 0.0
0 <= x <= 1
Language customization ID
1 - 256
Seconds of inactivity before the service stops listening. Default 30
x >= -1
Filter profanity in the transcript. Default true
Enable smart formatting (beta). Default false
Enable speaker labels (beta). Default false
Enable PII redaction (beta). Default false
Enable low latency mode. Default false
Opt out of data collection for learning. Default true
Value for x-watson-metadata header.
1 - 512
Version of smart formatting to use.
x >= 0
Weight for custom language model (0.0 to 1.0). Default 0.5
0 <= x <= 1
Bias for character insertion (-1.0 to 1.0). Default 0.0
-1 <= x <= 1
Show child attributes
1 - 2048
1 - 2048
Confidence threshold above which audio is classified as speech, default is 0.25
Confidence threshold below which audio is classified as non-speech, default is 0.25
Time interval (in ms) between partial transcription results, default is 500 ms.
Silence duration (in ms) after speech used to determine end of utterance, default is 1500 ms.
Show child attributes
1 - 128
Show child attributes
1 - 2048
1 - 128
1 - 2048
1 - 2048
Rate percentage for speech synthesis, default is 0
Pitch percentage for speech synthesis, default is 0
Language code for the voice, e.g., 'en-US'
2 - 16
Custom ID for the Watson TTS service
1 - 256
Meta ID for the Watson TTS service
1 - 256
Set to true to opt out of data collection for learning purposes
Show child attributes
The ID of the ElevenLabs model to use
1 - 128
The ID of the ElevenLabs voice to use
1 - 128
ElevenLabs API key
1 - 2048
Whether to apply text normalization
Language code for the voice, e.g., 'en', 'es'
2 - 16
Optimize streaming latency (0-4)
Whether to apply language-specific text normalization
List of pronunciation dictionary locators
Show child attributes
Seed for deterministic audio generation
Previous text for context
Next text for context
Voice settings for the ElevenLabs TTS
Show child attributes
Speech speed. Defaults to 1.0
Style exaggeration: the higher the value, the more computational resources are used. Defaults to 0.0
Stability: how stable the voice is and the randomness between each generation. Defaults to 0.5
Similarity boost: how closely the AI should adhere to the original voice. Defaults to 0.75
Whether to use speaker boost. Defaults to true
Maximum time to wait for additional transcription content before pushing aggregated result.
Default language code, e.g., 'en-us'
2 - 16
Additional language configurations keyed by language code
Show child attributes
Voice configuration for a specific language
Show child attributes
Show child attributes
1 - 128
Show child attributes
1 - 2048
1 - 128
1 - 2048
1 - 2048
Rate percentage for speech synthesis, default is 0
Pitch percentage for speech synthesis, default is 0
Language code for the voice, e.g., 'en-US'
2 - 16
Custom ID for the Watson TTS service
1 - 256
Meta ID for the Watson TTS service
1 - 256
Set to true to opt out of data collection for learning purposes
Show child attributes
1 - 2048
1 - 2048
1 - 128
Show child attributes
The ID of the ElevenLabs model to use
1 - 128
The ID of the ElevenLabs voice to use
1 - 128
ElevenLabs API key
1 - 2048
Whether to apply text normalization
Language code for the voice, e.g., 'en', 'es'
2 - 16
Optimize streaming latency (0-4)
Whether to apply language-specific text normalization
List of pronunciation dictionary locators
Show child attributes
ID of the pronunciation dictionary
Version ID of the pronunciation dictionary
Seed for deterministic audio generation
Previous text for context
Next text for context
Voice settings for the ElevenLabs TTS
Show child attributes
Speech speed. Defaults to 1.0
Style exaggeration: the higher the value, the more computational resources are used. Defaults to 0.0
Stability: how stable the voice is and the randomness between each generation. Defaults to 0.5
Similarity boost: how closely the AI should adhere to the original voice. Defaults to 0.75
Whether to use speaker boost. Defaults to true
Show child attributes
1 - 128
Show child attributes
1 - 2048
1 - 256
1 - 2048
1 - 2048
Background audio suppression level (0.0 to 1.0). Default 0.0
0 <= x <= 1
Language customization ID
1 - 256
Seconds of inactivity before the service stops listening. Default 30
x >= -1
Filter profanity in the transcript. Default true
Enable smart formatting (beta). Default false
Enable speaker labels (beta). Default false
Enable PII redaction (beta). Default false
Enable low latency mode. Default false
Opt out of data collection for learning. Default true
Value for x-watson-metadata header.
1 - 512
Version of smart formatting to use.
x >= 0
Weight for custom language model (0.0 to 1.0). Default 0.5
0 <= x <= 1
Bias for character insertion (-1.0 to 1.0). Default 0.0
-1 <= x <= 1
Show child attributes
1 - 2048
1 - 2048
Confidence threshold above which audio is classified as speech, default is 0.25
Confidence threshold below which audio is classified as non-speech, default is 0.25
Time interval (in ms) between partial transcription results, default is 500 ms.
Silence duration (in ms) after speech used to determine end of utterance, default is 1500 ms.
Show child attributes
The amount of time (ms) to wait for a new DTMF digit, default is 2500 ms.
The DTMF termination key that signals the end of DTMF input.
Maximum number of digits a user can enter.
Disable speech recognition during collection of DTMF digits, default is true.
Show child attributes
Enable Voice Activity Detection, default is true.
1 - 128
Show child attributes
The confidence threshold for speech detection (between 0.0 and 1.0), default is 0.7
The time in seconds speech must be detected before transitioning to SPEAKING state, default is 0.2
The time in seconds silence must be detected before transitioning to QUIET state, default is 0.8
The minimum audio volume threshold for speech detection (between 0.0 and 1.0), default is 0.6
Successful Response