Code for How to Convert Text to Speech in Python Tutorial


View on Github

tts_google.py

import gtts
from playsound import playsound

# make request to google to get synthesis
tts = gtts.gTTS("Hello world")
# save the audio file
tts.save("hello.mp3")
# play the audio file
playsound("hello.mp3")

# in spanish
tts = gtts.gTTS("Hola Mundo", lang="es")
tts.save("hola.mp3")
playsound("hola.mp3")

# all available languages along with their IETF tag
print(gtts.lang.tts_langs())

tts_pyttsx3.py

import pyttsx3

# initialize Text-to-speech engine
engine = pyttsx3.init()

# convert this text to speech
text = "Python is a great programming language"
engine.say(text)
# play the speech
engine.runAndWait()

# get details of speaking rate
rate = engine.getProperty("rate")
print(rate)

# setting new voice rate (faster)
engine.setProperty("rate", 300)
engine.say(text)
engine.runAndWait()

# slower
engine.setProperty("rate", 100)
engine.say(text)
engine.runAndWait()

# get details of all voices available
voices = engine.getProperty("voices")
print(voices)
# set another voice
engine.setProperty("voice", voices[1].id)
engine.say(text)
engine.runAndWait()

# saving speech audio into a file
engine.save_to_file(text, "python.mp3")
engine.runAndWait()

tts_openai.py

from openai import OpenAI

# initialize the OpenAI API client
api_key = "YOUR_OPENAI_API_KEY"
client = OpenAI(api_key=api_key)

# sample text to generate speech from
text = """In his miracle year, he published four groundbreaking papers. 
These outlined the theory of the photoelectric effect, explained Brownian motion, 
introduced special relativity, and demonstrated mass-energy equivalence."""

# generate speech from the text
response = client.audio.speech.create(
    model="tts-1", # the model to use, there is tts-1 and tts-1-hd
    voice="nova", # the voice to use, there is alloy, echo, fable, onyx, nova, and shimmer
    input=text, # the text to generate speech from
    speed=1.0, # the speed of the generated speech, ranging from 0.25 to 4.0
)
# save the generated speech to a file
response.stream_to_file("openai-output.mp3")

tts_transformers.py

from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import torch
import random
import string
import soundfile as sf

device = "cuda" if torch.cuda.is_available() else "cpu"
# load the processor
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
# load the model
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
# load the vocoder, that is the voice encoder
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
# we load this dataset to get the speaker embeddings
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

# speaker ids from the embeddings dataset
speakers = {
    'awb': 0,     # Scottish male
    'bdl': 1138,  # US male
    'clb': 2271,  # US female
    'jmk': 3403,  # Canadian male
    'ksp': 4535,  # Indian male
    'rms': 5667,  # US male
    'slt': 6799   # US female
}

def save_text_to_speech(text, speaker=None):
    # preprocess text
    inputs = processor(text=text, return_tensors="pt").to(device)
    if speaker is not None:
        # load xvector containing speaker's voice characteristics from a dataset
        speaker_embeddings = torch.tensor(embeddings_dataset[speaker]["xvector"]).unsqueeze(0).to(device)
    else:
        # random vector, meaning a random voice
        speaker_embeddings = torch.randn((1, 512)).to(device)
    # generate speech with the models
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
    if speaker is not None:
        # if we have a speaker, we use the speaker's ID in the filename
        output_filename = f"{speaker}-{'-'.join(text.split()[:6])}.mp3"
    else:
        # if we don't have a speaker, we use a random string in the filename
        random_str = ''.join(random.sample(string.ascii_letters+string.digits, k=5))
        output_filename = f"{random_str}-{'-'.join(text.split()[:6])}.mp3"
    # save the generated speech to a file with 16KHz sampling rate
    sf.write(output_filename, speech.cpu().numpy(), samplerate=16000)
    # return the filename for reference
    return output_filename

# generate speech with a US female voice
save_text_to_speech("Python is my favorite programming language", speaker=speakers["slt"])
# generate speech with a random voice
save_text_to_speech("Python is my favorite programming language")

# a challenging text with all speakers
text = """In his miracle year, he published four groundbreaking papers. 
These outlined the theory of the photoelectric effect, explained Brownian motion, 
introduced special relativity, and demonstrated mass-energy equivalence."""

for speaker_name, speaker in speakers.items():
    output_filename = save_text_to_speech(text, speaker)
    print(f"Saved {output_filename}")
# random speaker
output_filename = save_text_to_speech(text)
print(f"Saved {output_filename}")


pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy