2525import soundfile as sf
2626import numpy as np
2727import contextlib
28- import intel_extension_for_pytorch as ipex
2928
3029from .utils .english_normalizer import EnglishNormalizer
3130
@@ -40,19 +39,18 @@ def __init__(self, output_audio_path="./response.wav", voice="default", stream_m
4039 asset_path = "/intel-extension-for-transformers/intel_extension_for_transformers/neural_chat/assets" ):
4140 """Make sure your export LD_PRELOAD=<path to libiomp5.so and libtcmalloc> beforehand."""
4241 # default setting
43- self .original_model = SpeechT5ForTextToSpeech .from_pretrained ("microsoft/speecht5_tts" )
44- self .processor = SpeechT5Processor .from_pretrained ("microsoft/speecht5_tts" )
4542 self .device = device
43+ self .original_model = SpeechT5ForTextToSpeech .from_pretrained ("microsoft/speecht5_tts" ).to (self .device )
44+ self .processor = SpeechT5Processor .from_pretrained ("microsoft/speecht5_tts" )
4645 self .voice = voice
4746 self .output_audio_path = output_audio_path
4847 self .stream_mode = stream_mode
4948 self .spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
5049 self .speaker_model = EncoderClassifier .from_hparams (
5150 source = self .spk_model_name ,
52- run_opts = {"device" : self .device },
53- savedir = os .path .join ("/tmp" , self .spk_model_name )
54- )
55- self .vocoder = SpeechT5HifiGan .from_pretrained ("microsoft/speecht5_hifigan" )
51+ run_opts = {"device" : "cpu" },
52+ savedir = os .path .join ("/tmp" , self .spk_model_name ))
53+ self .vocoder = SpeechT5HifiGan .from_pretrained ("microsoft/speecht5_hifigan" ).to (self .device )
5654 self .vocoder .eval ()
5755 script_dir = os .path .dirname (os .path .abspath (__file__ ))
5856 if os .path .exists (os .path .join (script_dir , '../../../assets/speaker_embeddings/spk_embed_default.pt' )):
@@ -77,15 +75,6 @@ def __init__(self, output_audio_path="./response.wav", voice="default", stream_m
7775 elif os .path .exists (os .path .join (asset_path , 'speaker_embeddings/spk_embed_male.pt' )):
7876 self .male_speaker_embeddings = torch .load (os .path .join (asset_path , 'speaker_embeddings/spk_embed_male.pt' ))
7977
80- self .cpu_pool = None
81- if not torch .cuda .is_available ():
82- # ipex IOMP hardware resources
83- if 'LD_PRELOAD' in os .environ and 'libiomp' in os .environ ['LD_PRELOAD' ]:
84- import intel_extension_for_pytorch as ipex
85- self .cpu_pool = ipex .cpu .runtime .CPUPool ([i for i in range (24 )])
86- else :
87- print ("Warning! You have not preloaded iomp beforehand and that may lead to performance issue" )
88-
8978 self .normalizer = EnglishNormalizer ()
9079
9180 def create_speaker_embedding (self , driven_audio_path ):
@@ -97,10 +86,10 @@ def create_speaker_embedding(self, driven_audio_path):
9786 [driven_audio_path ]}).cast_column ("audio" , Audio (sampling_rate = 16000 ))
9887 waveform = audio_dataset [0 ]["audio" ]['array' ]
9988 with torch .no_grad ():
100- speaker_embeddings = self .speaker_model .encode_batch (torch .tensor (waveform ))
89+ speaker_embeddings = self .speaker_model .encode_batch (torch .tensor (waveform ). to ( "cpu" ) )
10190 speaker_embeddings = torch .nn .functional .normalize (speaker_embeddings , dim = 2 ) # [1,1,512]
10291 speaker_embeddings = speaker_embeddings [0 ] # [1,512]
103- return speaker_embeddings .cpu ( )
92+ return speaker_embeddings .to ( self . device )
10493
10594 def _lookup_voice_embedding (self , voice ,
10695 asset_path = "/intel-extension-for-transformers/intel_extension_for_transformers/neural_chat/assets" ):
@@ -179,8 +168,8 @@ def text2speech(self, text, output_audio_path, voice="default", do_batch_tts=Fal
179168 for text_in in texts :
180169 inputs = self .processor (text = text_in , return_tensors = "pt" )
181170 with torch .no_grad ():
182- with ipex . cpu . runtime . pin ( self . cpu_pool ) if self . cpu_pool else contextlib . nullcontext ():
183- spectrogram = model . generate_speech ( inputs ["input_ids" ], speaker_embeddings )
171+ spectrogram = model . generate_speech (
172+ inputs ["input_ids" ]. to ( self . device ) , speaker_embeddings . to ( self . device ) )
184173 speech = self .vocoder (spectrogram )
185174 all_speech = np .concatenate ([all_speech , speech .cpu ().numpy ()])
186175 all_speech = np .concatenate ([all_speech , np .array ([0 for i in range (8000 )])]) # pad after each end
0 commit comments