top of page

WebGLでの口パクが完成!

執筆者の写真: snackvirtual

まずは完成動画を掲載


現在はClientの入力をそのままFeeがエコーバックするだけになっている

動きとしては

  1. Clientからブラウザで文字入力

  2. http GETで文字送信

  3. Flask-Pythonが受信して、Feeの返信を作成(今回はエコーバック)

  4. PythonからCeVIO AIにFeeの返信文字を投げ、Wavファイルと口パク用の音素データ(Phoneme)を作成

  5. 作成完了したらGETコマンド返信

  6. WebGL側でGETの返信を受け、WavファイルのPlay準備のため2秒Wait

  7. その後Playと同時に音素データから口の開閉と形のパラメータを再生時間を見ながらタイミングを合わせてセット

という制御になっている


よってWav再生と口パクは同期をとっているわけではない(同時スタートさせているだけ)

また口パクは滑らかにするように、4Tap程度のFIRフィルタを作ってある

見た目だが、なぜかUnity Editor上で動かすとスムーズにならないのだが、WebGLでBuildした画像を見るとスムーズになっている(気がするだけ?)


とりあえずこれで、基本的なパーツはいったん完成と言えるだろう


最初に考えたシステムよりは、はるかにスマートになっていると思うので、

全体としての資料は別途まとめたい


Unity:InputFieldManager

using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using UnityEngine.UI;
using UnityEngine.Networking;
using TMPro;
using UnityEngine.UIElements;


public class InputFieldManager : MonoBehaviour
{
    // Chat input handler: sends the typed sentence to the Flask server
    // and stores the parsed response (sentences + phoneme timing data).
    TMP_InputField inputField;
    TMP_Text text;

    // Rolling 4-line chat window (text0 = newest line).
    string text0, text1, text2, text3;

    string ClientSentence = "";
    string FeeSentence = "";
    string Phoneme = "";

    // Flattened phoneme data: [phoneme, startTime, phoneme, startTime, ...].
    // Read by AudioPlayer to drive the lip-sync.
    public static string[] Phoneme_mat;

    // Pushes a new line into the 4-line chat window (newest on top).
    void TextWindowinput(string inputtext)
    {
        text3 = text2;
        text2 = text1;
        text1 = text0;
        text0 = inputtext;

        text.text = text0 + "\r\n" + text1 + "\r\n" +  text2 + "\r\n" + text3;
    }

    void Start()
    {
        inputField = GameObject.Find("InputField (TMP)").GetComponent<TMP_InputField>();
        text = GameObject.Find("Text (TMP)").GetComponent<TMP_Text>();
        inputField.ActivateInputField();
        inputField.Select();
    }

    void Update()
    {
        // Keep keyboard focus on the input field at all times.
        if (!inputField.isFocused)
        {
            inputField.ActivateInputField();
            inputField.Select();
        }

        // Show both sentences in the chat window once playback has started.
        if (AudioPlayer.is_Playback_play && FeeSentence != "")
        {
            TextWindowinput(ClientSentence);
            TextWindowinput(FeeSentence);
            ClientSentence = "";
            FeeSentence = "";
        }
    }

    // Hooked to the InputField's OnEndEdit event in the editor.
    public void OnEndEdit()
    {
        StartCoroutine(SendData(InputFieldManager_ID.ClientId, InputFieldManager_ID.ClientName, inputField.text));
        inputField.text = "";
    }

    // Request/response format (comma-separated key=value pairs):
    //   request : ?SendData=ClientId=...,ClientName=...,ClientSentence=...
    //   response: ClientId=...,ClientSentence=...,FeeSentence=...,Phoneme=(p,start)(p,start)...
    // If our ClientId matches the response ClientId we are the speaker,
    // otherwise we are a listener.
    IEnumerator SendData(string id, string name, string sentence)
    {
        string url = "http://127.0.0.1:8080/";

        // Escape the payload so user text containing '&', '#', spaces, etc.
        // survives the query string round-trip (Flask URL-decodes it back).
        string payload = "ClientId=" + id + ",ClientName=" + name + ",ClientSentence=" + sentence;

        // using: UnityWebRequest holds native resources and must be disposed.
        using (UnityWebRequest request = UnityWebRequest.Get(url + "?SendData=" + UnityWebRequest.EscapeURL(payload)))
        {
            yield return request.SendWebRequest();

            if (request.result != UnityWebRequest.Result.Success)
            {
                Debug.Log(request.error);
            }
            else
            {
                Debug.Log("Data sent successfully!");
                AudioPlayer.is_Playback_ready = true;
                string response = request.downloadHandler.text;
                Debug.Log(response);

                // Pull the three fields out of the response; a missing marker
                // now yields "" instead of throwing from Substring.
                ClientSentence = ExtractField(response, ",ClientSentence=", ",FeeSentence=");
                FeeSentence    = ExtractField(response, ",FeeSentence=", ",Phoneme=");
                Phoneme        = ExtractField(response, ",Phoneme=", null);

                // "(p,t)(p,t)..." -> "p,t,p,t,..." -> string[]
                // (note: the trailing ')' leaves one empty element at the end,
                // which AudioPlayer's bounds check relies on).
                string Phoneme_t = Phoneme.Replace("(", "").Replace(")", ",");
                Phoneme_mat = Phoneme_t.Split(',');

                // Debug dump of the parsed phoneme array.
                string x = "";
                for (int i = 0; i < Phoneme_mat.Length; i++)
                {
                    x = x + " " + Phoneme_mat[i];
                }
                Debug.Log(x);
            }
        }
    }

    // Returns the substring between startKey (exclusive) and endKey;
    // endKey == null means "to the end of the response".
    // Returns "" (with a warning) when startKey is absent.
    static string ExtractField(string response, string startKey, string endKey)
    {
        int startpos = response.IndexOf(startKey);
        if (startpos < 0)
        {
            Debug.LogWarning("Response missing field: " + startKey);
            return "";
        }
        startpos += startKey.Length;

        int endpos = endKey != null ? response.IndexOf(endKey, startpos) : -1;
        if (endpos < 0)
        {
            endpos = response.Length;
        }
        return response.Substring(startpos, endpos - startpos);
    }
}

Unity:AudioPlayer

using System.Collections;
using System.Collections.Generic;
using JetBrains.Annotations;
using UnityEngine;
using UnityEngine.Networking;

public class AudioPlayer : MonoBehaviour
{
    // Downloads the generated WAV from the Flask server, plays it, and
    // steps through InputFieldManager.Phoneme_mat to drive the lip-sync.
    // Audio and mouth movement are started together, not re-synchronized.
    public string url = "http://127.0.0.1:8080/Fee_voice/Fee_voice.wav";

    // Set by InputFieldManager when the server response arrives.
    public static bool is_Playback_ready = false;
    // Set by Playback() when the clip starts; drives the lip-sync loop.
    public static bool is_Playback_play = false;

    private float startTime;   // 0 == lip-sync not started yet (sentinel)
    private int index = 0;     // current position in Phoneme_mat (steps by 2)
    float Phoneme_elapsed;     // start time (sec) of the next phoneme
    int mat_size;

    void Start()
    {
    }

    void Update()
    {
        // Kick off the download/playback coroutine once per utterance.
        if (is_Playback_ready)
        {
            is_Playback_ready = false;
            StartCoroutine(Playback());
        }

        // Lip-sync: advance through the phoneme list against elapsed time.
        if (is_Playback_play)
        {
            if (startTime == 0)
            {
                // First frame after Play(): initialise the phoneme cursor.
                startTime = Time.time;
                index = 0;
                // InvariantCulture: phoneme times always use '.' as the
                // decimal separator, regardless of the player's locale.
                Phoneme_elapsed = float.Parse(InputFieldManager.Phoneme_mat[1],
                    System.Globalization.CultureInfo.InvariantCulture);
                Fee_ParameterSet.Mouth = "n";
                mat_size = InputFieldManager.Phoneme_mat.Length;
                Debug.Log(mat_size);
            }
            else
            {
                float elapsedTime = Time.time - startTime;

                if (elapsedTime >= Phoneme_elapsed)
                {
                    if (mat_size - 1 > index + 2)
                    {
                        // Move to the next (phoneme, startTime) pair.
                        index += 2;
                        Phoneme_elapsed = float.Parse(InputFieldManager.Phoneme_mat[index + 1],
                            System.Globalization.CultureInfo.InvariantCulture);
                        Fee_ParameterSet.Mouth = InputFieldManager.Phoneme_mat[index];
                    }
                    else
                    {
                        // Last phoneme consumed: reset for the next utterance.
                        index = 0;
                        startTime = 0;
                        is_Playback_play = false;
                    }
                }
            }
        }
    }

    // Downloads the WAV, waits a fixed 2 s for buffering, then starts
    // playback and raises is_Playback_play for the lip-sync loop.
    IEnumerator Playback()
    {
        using (var uwr = UnityWebRequestMultimedia.GetAudioClip(url, AudioType.WAV))
        {
            yield return uwr.SendWebRequest();

            // != Success also covers DataProcessingError, which the previous
            // ConnectionError/ProtocolError check silently missed.
            if (uwr.result != UnityWebRequest.Result.Success)
            {
                Debug.LogError(uwr.error);
            }
            else
            {
                var audioClip = DownloadHandlerAudioClip.GetContent(uwr);
                var audioSource = GetComponent<AudioSource>();
                audioSource.clip = audioClip;

                // Fixed wait so the clip is ready before Play().
                yield return new WaitForSeconds(2);

                is_Playback_play = true;
                Debug.Log("audioplayer.Play");
                audioSource.Play();
            }
        }
    }
}

Unity:Fee_ParameterSet

using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using Live2D.Cubism.Core;
using Live2D.Cubism.Framework;

using System.Net;
using System.Text;
using System.Threading;
using System;
using Unity.VisualScripting;

public class Fee_ParameterSet : MonoBehaviour
{
    // Drives the Live2D Cubism mouth parameters (form + open) from the
    // current phoneme, smoothing transitions with a short FIR filter.

    private CubismModel _model;

    private void Start()
    {
        // Cache the Cubism model this component is attached to.
        _model = this.FindCubismModel();
    }

    // Current phoneme, written by AudioPlayer during playback.
    public static string Mouth = "N";

    // FIR delay lines for the two mouth parameters (index 0 = newest sample).
    public float[] Mouth_form_delay = {0,0,0,0,0,0,0,0,0,0};
    public float[] Mouth_open_delay = {0,0,0,0,0,0,0,0,0,0};

    // Debug capture of the filtered mouth-open values for one utterance.
    public string log = "";
    public bool log_flag = false;

    private void LateUpdate()
    {
        // Target (form, open) levels per vowel slot: a, i, u, e, o, n/sil.
        float[] formLevels = { 1f,   1f, -1f,   0f,   0f, 1f };
        float[] openLevels = { 1f, 0.3f, 0.3f, 0.6f, 0.4f, 0f };

        // FIR coefficients: a 1 enables that tap (4-tap moving average).
        float[] taps = { 1, 1, 1, 1, 0, 0, 0, 0, 0, 0 };

        // Shift both delay lines by one sample (oldest value falls off).
        for (int i = 9; i >= 1; i--)
        {
            Mouth_form_delay[i] = Mouth_form_delay[i - 1];
            Mouth_open_delay[i] = Mouth_open_delay[i - 1];
        }

        // Map the current phoneme to a vowel slot. Phonemes outside the
        // table (consonants) leave the newest sample unchanged, exactly
        // like the original if/else chain.
        int vowel;
        switch (Mouth)
        {
            case "a": case "A": vowel = 0; break;
            case "i": case "I": vowel = 1; break;
            case "u": case "U": vowel = 2; break;
            case "e": case "E": vowel = 3; break;
            case "o": case "O": vowel = 4; break;
            case "n": case "N": case "sil": vowel = 5; break;
            default: vowel = -1; break;
        }
        if (vowel >= 0)
        {
            Mouth_form_delay[0] = formLevels[vowel];
            Mouth_open_delay[0] = openLevels[vowel];
        }

        // Count active taps — this is the moving-average divisor.
        int filter_tap = 0;
        for (int i = 0; i < taps.Length; i++)
        {
            if (taps[i] == 1)
            {
                filter_tap++;
            }
        }

        // Weighted sums, accumulated oldest-first to match the original
        // left-to-right expression exactly (same float rounding).
        float Mouth_form_out = 0f;
        float Mouth_open_out = 0f;
        for (int i = 9; i >= 0; i--)
        {
            Mouth_form_out += Mouth_form_delay[i] * taps[i];
            Mouth_open_out += Mouth_open_delay[i] * taps[i];
        }
        Mouth_form_out /= filter_tap;
        Mouth_open_out /= filter_tap;

        // Indices 14/15 appear to be mouth form / mouth open on this model
        // — NOTE(review): confirm against the Cubism parameter list.
        var parameter = _model.Parameters[14];
        parameter.BlendToValue(CubismParameterBlendMode.Override, Mouth_form_out);
        parameter = _model.Parameters[15];
        parameter.BlendToValue(CubismParameterBlendMode.Override, Mouth_open_out);

        // Debug: collect filtered open values while the mouth is away from
        // the neutral pose, dump them once it returns to neutral.
        if ((Mouth_form_delay[0] != formLevels[5]) || (Mouth_open_delay[0] != openLevels[5]))
        {
            log += "," + Mouth_open_out;
            log_flag = true;
        }
        if ((Mouth_form_delay[0] == formLevels[5]) && (Mouth_open_delay[0] == openLevels[5]) && (log_flag == true))
        {
            Debug.Log(log);
            log = "";
            log_flag = false;
        }
    }
}

Python:test.py

from flask import Flask, send_from_directory,request
from flask_cors import CORS
import CeVIO_control
import time

app = Flask(__name__)
# Allow cross-origin requests from any origin (the WebGL build is served
# from a different origin than this API).
CORS(app, resources={r"/*": {"origins": "*"}})

# NOTE(review): these module-level values are never read — get() assigns
# its own locals of the same names. Kept for compatibility.
recievedata = ""
senddata = ""

# GET受信設定と返信
# 通信フォーマット
# ブラウザより送信(GET)  http:// .... ?ClientID=hide,ClientName=英夫,ClientSentence=今日はいい天気だねえ
# ブラウザへの返信         ClientID=hide,ClientSentence=英夫:今日はいい天気だねえ,FeeSentence=Fee:ですね,Phoneme=(sil,0.0)(d,0.005)(e,0.07)(s,0.165)(U,0.215)(n,0.27)(e,0.34)(sil,0.5700000000000001)  ※PhonemeはCeVIO_phoneme_data_getが返す(音素,開始時刻)の2要素形式
# 自分のClientIDと返信のClientIDが一致していたら、自分が話者、そうでないときは聞き役

@app.route('/')
def get():
    """Handle one chat message from the WebGL client.

    Request : ?SendData=ClientId=<id>,ClientName=<name>,ClientSentence=<text>
    Response: ClientId=...,ClientSentence=...,FeeSentence=...,Phoneme=(p,start)(p,start)...

    A client whose ClientId matches the response ClientId is the speaker;
    everyone else is a listener. Fee's reply is currently a plain echo.
    Also writes the synthesized WAV via CeVIO_control as a side effect.
    """
    recievedata = request.args.get('SendData')
    print(recievedata)

    # Guard: a request without SendData would crash the slicing below.
    if not recievedata:
        return 'Error: missing SendData parameter', 400

    # Split the comma-separated key=value payload, consuming it left to right.
    ClientId = recievedata[recievedata.find('ClientId=')+len('ClientId='):recievedata.find(',')]
    recievedata = recievedata[len('ClientId=')+len(ClientId)+1:]
    ClientName = recievedata[recievedata.find('ClientName=')+len('ClientName='):recievedata.find(',')]
    recievedata = recievedata[len('ClientName=')+len(ClientName)+1:]
    ClientSentence = recievedata[recievedata.find('ClientSentence=')+len('ClientSentence='):]
    print(ClientId, ClientName, ClientSentence)

    # Fee's reply logic goes here (currently an echo of the client text).
    FeeSentence = ClientSentence

    # CeVIO AI: configure the talker and get phoneme timing for the reply.
    CeVIO_control.CeVIO_init()
    Phoneme = CeVIO_control.CeVIO_phoneme_data_get(FeeSentence)
    print(Phoneme)

    senddata = 'ClientId=' + ClientId \
        + ',ClientName=' + ClientName \
        + ',ClientSentence=' + ClientName + ' : ' + ClientSentence \
        + ',FeeSentence=' + 'Fee : ' + FeeSentence \
        + ',Phoneme=' + Phoneme

    # Render the WAV the Unity client will fetch from /Fee_voice/.
    CeVIO_control.CeVIO_wav_file_output(FeeSentence)

    return senddata
 
 

# Expose the generated wav file over HTTP for the Unity client.
@app.route("/Fee_voice/<path:filename>")
def play(filename):
    """Serve a file from the Fee_voice directory (the synthesized WAV)."""
    return send_from_directory("Fee_voice", filename)


if __name__ == '__main__':
    # Listen on all interfaces, port 8080 (matches the URLs in the Unity
    # scripts). debug=True enables the reloader — development use only.
    app.run(debug=True, host="0.0.0.0", port=8080)



# 注意:CeVIO AIはPythonを実行する前に起動させておくこと

Python:CeVIO_control.py

import win32com.client
# COM connections to CeVIO AI. The CeVIO AI application must already be
# running before this module is imported (see the note in test.py).
service_control = win32com.client.Dispatch("CeVIO.Talk.RemoteService2.ServiceControl2V40")
talker2 = win32com.client.Dispatch("CeVIO.Talk.RemoteService2.Talker2V40")


def CeVIO_init():
    """Configure the CeVIO AI talker: cast, volume, speed and emotion."""
    service_control.StartHost(False)

    # Print the available casts (debug aid).
    casts = talker2.AvailableCasts
    print([casts.At(i) for i in range(casts.Length)])

    # Voice cast to use.
    talker2.Cast = "夏色花梨"

    # Playback volume.
    talker2.Volume = 5

    # Speaking speed.
    talker2.speed = 43

    # Print the emotion component names (debug aid), then set the emotion.
    components = talker2.Components
    print([components.At(i).Name for i in range(components.Length)])
    CeVIO_emotion_set(100, 0, 0, 0, 0)

def CeVIO_emotion_set(ureshii, futsuu, ikari, kanashimi, ochitsuki):
    """Set the talker's emotion components (joy, normal, anger, sadness, calm)."""
    components = talker2.Components
    # Assign each value to its named component, in the original order.
    for name, value in (("嬉しい", ureshii),
                        ("普通", futsuu),
                        ("怒り", ikari),
                        ("哀しみ", kanashimi),
                        ("落ち着き", ochitsuki)):
        components.ByName(name).Value = value

# Render Fee's sentence to a wav file for the Unity client to stream.
def CeVIO_wav_file_output(text):
    """Synthesize `text` with the configured talker and write it as a WAV.

    The output path must match the directory served by the /Fee_voice/
    route in test.py.
    """
    # Raw string: the previous plain string only worked because \S, \F etc.
    # happen not to be escape sequences, which raises SyntaxWarning on
    # modern Python. The resulting path is byte-identical.
    talker2.OutputWaveToFile(text, r"C:\Snack_Virtual_2\Flask_Python\Fee_voice\Fee_voice.wav")
    print("Fee voice : " + text )
    return

def CeVIO_phoneme_data_get(text):
    """Return phoneme timing for `text` as a "(phoneme,startTime)..." string."""
    phonemes = talker2.GetPhonemes(text)
    # Collect one "(phoneme,start)" chunk per entry, then concatenate.
    parts = []
    for i in range(phonemes.Length):
        entry = phonemes.At(i)
        parts.append('(' + entry.Phoneme + ',' + str(entry.StartTime) + ')')
    return ''.join(parts)

# if __name__ == '__main__':

#     CeVIO_init()
#     CeVIO_phoneme_data_get("ですね")

それにしてもC#とPythonを同時にやっていると頭がぐちゃぐちゃになる

文法とか頭が大混乱なので、なんでもかんでもBingでコード化してもらった

物凄い威力だ

閲覧数:3回0件のコメント

最新記事

すべて表示

Comments


bottom of page