Hi ifarkas, This feature is added in iClone 8.1. You can add the spoken words while creating a viseme clip. A new version will be released in the next few days, thanks.
# Example: create a viseme clip that carries per-word text data.
# Relies on names defined outside this snippet: RLPy, MSVisemeOffset,
# MSVisemeID, ms_mapping_dict, dataset_path and self.viseme_animator.

clip_time = RLPy.IndexedFrameTime(0, RLPy.RGlobal.GetFps())  # create clip at frame 0

# Build one viseme key per entry of MSVisemeOffset (offsets are passed to
# RTick.FromMilliSecond, so they are treated as milliseconds).
viseme_list = RLPy.RVisemeKeyVector()
for i, offset_ms in enumerate(MSVisemeOffset):
    key = RLPy.RVisemeKey()
    # Translate the source viseme ID through ms_mapping_dict; an IndexError
    # here means MSVisemeID is shorter than MSVisemeOffset.
    key.SetID(ms_mapping_dict[MSVisemeID[i]])
    key.SetTime(RLPy.RTick.FromMilliSecond(int(offset_ms)))
    key.SetWeight(1.0)
    viseme_list.append(key)

# Word timing data. NOTE: the viseme indices 0, 10 and 20 are hard-coded for
# this sample, so MSVisemeOffset must contain at least 21 entries.
word_list = RLPy.RWordDataVector()

word1 = RLPy.RWordData()
word1.m_kStart = RLPy.RTick.FromMilliSecond(int(MSVisemeOffset[0]))
word1.m_kEnd = RLPy.RTick.FromMilliSecond(int(MSVisemeOffset[10]))
word1.m_strText = "Hi"
word_list.append(word1)

word2 = RLPy.RWordData()
word2.m_kStart = RLPy.RTick.FromMilliSecond(int(MSVisemeOffset[10]))
word2.m_kEnd = RLPy.RTick.FromMilliSecond(int(MSVisemeOffset[20]))
word2.m_strText = "Maddie"
word_list.append(word2)

# Load the audio that accompanies the visemes.
wav_path = dataset_path + "HiMaddie.wav"
audio_object = RLPy.RAudio.CreateAudioObject()
audio_object.Load(wav_path)

# Create the clip from the audio, viseme keys and word data; the last
# argument is presumably the full spoken text (confirm against the API docs).
result = self.viseme_animator.AddVisemesClipWithData(
    clip_time, "VisemeClip", audio_object, viseme_list, word_list, "Hi! Maddie"
)