feat: Text to speech support streaming playback (#2661)

2025-03-24 14:21:29 +08:00 · 2025-03-24 14:21:29 +08:00 · dcee1b6d55
commit dcee1b6d55
parent 0ce6dd0795
4 changed files with 343 additions and 136 deletions
--- a/ui/src/api/type/application.ts
+++ b/ui/src/api/type/application.ts
@ -1,5 +1,6 @@
 import { type Dict } from '@/api/type/common'
 import { type Ref } from 'vue'
 import bus from '@/bus'
 interface ApplicationFormType {
  name?: string
  desc?: string
@ -144,8 +145,8 @@ export class ChatRecordManage {
        })
      }
    }
    this.chat.answer_text = this.chat.answer_text + chunk_answer
    bus.emit('change:answer', { record_id: this.chat.record_id, is_end: false })
  }
  get_current_up_node(run_node: any) {
    const index = this.node_list.findIndex((item) => item == run_node)
@ -232,6 +233,7 @@ export class ChatRecordManage {
    if (this.loading) {
      this.loading.value = false
    }
    bus.emit('change:answer', { record_id: this.chat.record_id, is_end: true })
    if (this.id) {
      clearInterval(this.id)
    }
--- a/ui/src/components/ai-chat/component/operation-button/ChatOperationButton.vue
+++ b/ui/src/components/ai-chat/component/operation-button/ChatOperationButton.vue
@ -8,20 +8,35 @@
      <!-- 语音播放 -->
      <span v-if="tts">
        <el-tooltip
          v-if="audioManage?.isPlaying()"
          effect="dark"
-          :content="$t('chat.operation.play')"
+          :content="$t('chat.operation.pause')"
          placement="top"
          v-if="!audioPlayerStatus"
        >
-          <el-button text :disabled="!data?.write_ed" @click="playAnswerText(data?.answer_text)">
+          <el-button
-            <AppIcon iconName="app-video-play"></AppIcon>
+            type="primary"
-          </el-button>
+            text
-        </el-tooltip>
+            :disabled="!data?.write_ed"
-        <el-tooltip v-else effect="dark" :content="$t('chat.operation.pause')" placement="top">
+            @click="audioManage?.pause(true)"
-          <el-button type="primary" text :disabled="!data?.write_ed" @click="pausePlayAnswerText()">
+          >
            <AppIcon iconName="app-video-pause"></AppIcon>
          </el-button>
        </el-tooltip>
        <el-tooltip effect="dark" :content="$t('chat.operation.play')" placement="top" v-else>
          <el-button
            text
            :disabled="!data?.write_ed"
            @click="
              () => {
                bus.emit('play:pause', props.data.record_id)
                audioManage?.play(props.data.answer_text, true)
              }
            "
          >
            <AppIcon iconName="app-video-play"></AppIcon>
          </el-button>
        </el-tooltip>
        <el-divider direction="vertical" />
      </span>
      <span v-if="type == 'ai-chat' || type == 'log'">
@ -82,6 +97,7 @@
    </div>
    <!-- 先渲染，不然不能播放   -->
    <audio ref="audioPlayer" v-for="item in audioList" :key="item" controls hidden="hidden"></audio>
    <div ref="audioCiontainer"></div>
  </div>
 </template>
 <script setup lang="ts">
@ -91,8 +107,9 @@ import { copyClick } from '@/utils/clipboard'
 import applicationApi from '@/api/application'
 import { datetimeFormat } from '@/utils/time'
 import { MsgError } from '@/utils/message'
 import { t } from '@/locales'
 import bus from '@/bus'
 import { da } from 'element-plus/es/locale'
 const route = useRoute()
 const {
  params: { id }
@ -118,12 +135,12 @@ const props = withDefaults(
 const emit = defineEmits(['update:data', 'regeneration'])
 const audioPlayer = ref<HTMLAudioElement[] | null>([])
 const audioCiontainer = ref<HTMLDivElement>()
 const audioPlayerStatus = ref(false)
 const buttonData = ref(props.data)
 const loading = ref(false)
-const utterance = ref<SpeechSynthesisUtterance | null>(null)
+
 const audioList = ref<string[]>([])
 const currentAudioIndex = ref(0)
 function regeneration() {
  emit('regeneration')
@ -166,144 +183,331 @@ function markdownToPlainText(md: string) {
 /**
  * Strip every `<form_rander>…</form_rander>` block (tags and enclosed content,
  * matched non-greedily across newlines) from the text, then trim the
  * surrounding whitespace of what remains.
  */
 function removeFormRander(text: string) {
  const formBlockPattern = /<form_rander>[\s\S]*?<\/form_rander>/g
  const withoutForms = text.replace(formBlockPattern, '')
  return withoutForms.trim()
 }
-
/**
 * Scan `keys` from the last element to the first and return the first entry
 * that is less than or equal to `index`; return 0 when no entry qualifies.
 * With `keys` in ascending order this is the largest key not exceeding `index`.
 */
+function getKey(keys: Array<number>, index: number) {
-const playAnswerText = (text: string) => {
+  // Scan backwards for the first key that is less than or equal to index
-  if (!text) {
+  for (let i = keys.length - 1; i >= 0; i--) {
-    text = t('chat.tip.answerMessage')
+    if (keys[i] <= index) {
      return keys[i]
    }
  }
-  // Remove the form renderer
+  return 0
-  text = removeFormRander(text)
+}
-  // text 处理成纯文本
/**
 * Split `str` into segments for speech playback.
 *
 * - `str`: the text to split.
 * - `minLengthConfig`: maps a threshold index to a minimum segment length; the
 *   entry under key 0 is the initial minimum.
 * - `is_end`: when true, the trailing text that has not reached a separator is
 *   emitted as a final segment; when false it is dropped from the result.
 *
 * Returns the list of segments; each `<audio ...></audio>` tag is always a
 * segment of its own.
 */
+function smartSplit(
-  text = markdownToPlainText(text)
+  str: string,
-  // console.log(text)
+  minLengthConfig: any = {
-  audioPlayerStatus.value = true
+    0: 10,
-  // Split into multiple parts
+    1: 25,
-  audioList.value = text.split(/(<audio[^>]*><\/audio>)/).filter((item) => item.trim().length > 0)
+    3: 50,
-  nextTick(() => {
+    5: 100
-    // console.log(audioList.value, audioPlayer.value)
+  },
-    playAnswerTextPart()
+  is_end = false
-  })
+) {
  // Separators: the sentence terminators 。 ？ and newline, or a complete <audio ...></audio> tag
  const regex = /([。？\n])|(<audio[^>]*><\/audio>)/g
  // Split and keep the separators (the capture groups place them into `parts`)
  const parts = str.split(regex)
  const result = []
  const keys = Object.keys(minLengthConfig).map(Number)
  let minLength = minLengthConfig[0]
  let temp_str = ''
  for (let i = 0; i < parts.length; i++) {
    const content = parts[i]
    // For every match both capture groups are inserted; the group that did not match is undefined
    if (content == undefined) {
      continue
    }
    // An audio tag flushes any pending text and is emitted as its own segment
    if (/^<audio[^>]*><\/audio>$/.test(content)) {
      if (temp_str.length > 0) {
        result.push(temp_str)
        temp_str = ''
      }
      result.push(content)
      continue
    }
    temp_str += parts[i]
    // Emit only when the accumulated text exceeds the current minimum and ends on a terminator
    if (temp_str.length > minLength && /[。？\n]$/.test(temp_str)) {
      // NOTE(review): `i` is the index into `parts`, not the number of segments emitted so far
      minLength = minLengthConfig[getKey(keys, i)]
      result.push(temp_str)
      temp_str = ''
    }
  }
  // The unterminated tail is only emitted once the caller signals the text is complete
  if (temp_str.length > 0 && is_end) {
    result.push(temp_str)
  }
  return result
 }
-const playAnswerTextPart = () => {
/**
 * Playback state of a single speech segment.
 */
+enum AudioStatus {
-  // console.log(audioList.value, currentAudioIndex.value)
+  /**
-  if (currentAudioIndex.value === audioList.value.length) {
+   * Finished
-    audioPlayerStatus.value = false
+   */
-    currentAudioIndex.value = 0
+  END = 'END',
-    return
+  /**
   * Playing
   */
  PLAY_INT = 'PLAY_INT',
  /**
   * Just mounted
   */
  MOUNTED = 'MOUNTED',
  /**
   * Ready
   */
  READY = 'READY',
  /**
   * Error
   */
  ERROR = 'ERROR'
 }
 class AudioManage {
  textList: Array<string>
  statusList: Array<AudioStatus>
  audioList: Array<HTMLAudioElement | SpeechSynthesisUtterance>
  ttsType: string
  root: Element
  /**
   * Create an empty manager.
   *
   * @param ttsType speech backend selector stored on the instance
   * @param root container element stored on the instance
   */
  constructor(ttsType: string, root: HTMLDivElement) {
    // Configuration handed in by the caller
    this.ttsType = ttsType
    this.root = root
    // Parallel per-segment lists, all starting empty
    this.textList = []
    this.statusList = []
    this.audioList = []
  }
-  if (audioList.value[currentAudioIndex.value].includes('<audio')) {
  /**
   * Register the segments of `textList` that are not tracked yet (everything
   * past the current length of `this.textList`). For each new segment a status
   * entry and an audio object are created: an `<audio>` element appended to
   * `root` when `ttsType` is 'TTS', otherwise a SpeechSynthesisUtterance.
   */
+  appendTextList(textList: Array<string>) {
-    if (audioPlayer.value) {
+    const newTextList = textList.slice(this.textList.length)
-      audioPlayer.value[currentAudioIndex.value].src =
+    // No new segments were added
-        audioList.value[currentAudioIndex.value].match(/src="([^"]*)"/)?.[1] || ''
+    if (newTextList.length <= 0) {
-      audioPlayer.value[currentAudioIndex.value].play() // Autoplay the audio
+      return
-      audioPlayer.value[currentAudioIndex.value].onended = () => {
+    }
-        currentAudioIndex.value += 1
+    newTextList.forEach((text, index) => {
-        playAnswerTextPart()
+      this.textList.push(text)
      this.statusList.push(AudioStatus.MOUNTED)
      // Rebind the callback parameter to the segment's absolute position in the tracked lists
      index = this.textList.length - 1
      if (this.ttsType === 'TTS') {
        const audioElement: HTMLAudioElement = document.createElement('audio')
        audioElement.controls = true
        audioElement.hidden = true
        /**
         * Playback-ended handler
         */
        audioElement.onended = () => {
          this.statusList[index] = AudioStatus.END
          // If every segment has finished playing
          if (this.statusList.every((item) => item === AudioStatus.END)) {
            // Reset all segments to READY
            this.statusList = this.statusList.map((item) => AudioStatus.READY)
          } else {
            // next
            this.play()
          }
        }
        this.root.appendChild(audioElement)
        // A segment that is itself an <audio> tag is played from its own src attribute
        if (/^<audio[^>]*><\/audio>$/.test(text)) {
          audioElement.src = text.match(/src="([^"]*)"/)?.[1] || ''
          this.statusList[index] = AudioStatus.READY
        } else {
          // Plain text: request synthesized speech for this segment
          applicationApi
            .postTextToSpeech(
              (props.applicationId as string) || (id as string),
              { text: text },
              loading
            )
            .then(async (res: any) => {
              // A JSON response is treated as an error message rather than audio data
              if (res.type === 'application/json') {
                const text = await res.text()
                MsgError(text)
                this.statusList[index] = AudioStatus.ERROR
                this.play()
                return
              }
              // Assume the response is the byte content of an MP3 file
              // Create a Blob object
              const blob = new Blob([res], { type: 'audio/mp3' })
              // Create an object URL
              const url = URL.createObjectURL(blob)
              audioElement.src = url
              this.statusList[index] = AudioStatus.READY
              this.play()
            })
            .catch((err) => {
              console.log('err: ', err)
              this.statusList[index] = AudioStatus.ERROR
              this.play()
            })
        }
        this.audioList.push(audioElement)
      } else {
        const speechSynthesisUtterance: SpeechSynthesisUtterance = new SpeechSynthesisUtterance(
          text
        )
        speechSynthesisUtterance.onpause = () => {
          console.log('onpause')
        }
        speechSynthesisUtterance.onend = () => {
          this.statusList[index] = AudioStatus.END
          // If every segment has finished playing
          if (this.statusList.every((item) => item === AudioStatus.END)) {
            // Reset all segments to READY
            this.statusList = this.statusList.map((item) => AudioStatus.READY)
          } else {
            // next
            this.play()
          }
        }
        // On a speech error the segment goes back to READY (not ERROR)
        speechSynthesisUtterance.onerror = (e) => {
          this.statusList[index] = AudioStatus.READY
        }
        this.statusList[index] = AudioStatus.READY
        this.audioList.push(speechSynthesisUtterance)
        this.play()
      }
    })
  }
  /**
   * Re-request synthesized speech for every segment currently marked ERROR.
   *
   * Only segments backed by an `<audio>` element are retried. While a retry
   * request is in flight the segment is marked MOUNTED instead of ERROR:
   * `play()` calls this method on every invocation, so leaving the status at
   * ERROR would issue a duplicate request for the same segment on each call
   * made before the first request settles.
   *
   * On success the element receives the new audio URL, the segment becomes
   * READY and playback is resumed via `play()`. On failure the segment goes
   * back to ERROR so that a later `play()` can retry it.
   */
  reTryError() {
    this.statusList.forEach((status, index) => {
      if (status !== AudioStatus.ERROR) {
        return
      }
      const audioElement = this.audioList[index]
      if (!(audioElement instanceof HTMLAudioElement)) {
        return
      }
      // Mark the segment as pending so concurrent play() calls do not re-request it
      this.statusList[index] = AudioStatus.MOUNTED
      applicationApi
        .postTextToSpeech(
          (props.applicationId as string) || (id as string),
          { text: this.textList[index] },
          loading
        )
        .then(async (res: any) => {
          // A JSON response is treated as an error message rather than audio data
          if (res.type === 'application/json') {
            const message = await res.text()
            MsgError(message)
            this.statusList[index] = AudioStatus.ERROR
            return
          }
          // Wrap the returned bytes in a Blob and expose it through an object URL
          const blob = new Blob([res], { type: 'audio/mp3' })
          const url = URL.createObjectURL(blob)
          audioElement.src = url
          this.statusList[index] = AudioStatus.READY
          this.play()
        })
        .catch((err) => {
          console.log('err: ', err)
          this.statusList[index] = AudioStatus.ERROR
        })
    })
  }
  /** Report whether any tracked segment is currently in the PLAY_INT state. */
  isPlaying() {
    return this.statusList.includes(AudioStatus.PLAY_INT)
  }
  /**
   * Optionally register new text, then start the first READY segment unless a
   * segment is already playing.
   *
   * @param text full answer text; when given it is split into segments and the
   *   segments not tracked yet are appended
   * @param is_end whether `text` is complete (forwarded to getTextList as a boolean)
   */
  play(text?: string, is_end?: boolean) {
    if (text) {
      const textList = this.getTextList(text, is_end ? true : false)
      this.appendTextList(textList)
    }
    // Return immediately if a segment is already being read
    if (this.statusList.some((item) => [AudioStatus.PLAY_INT].includes(item))) {
      return
    }
    this.reTryError()
    // The segment to play
    const index = this.statusList.findIndex((status) => [AudioStatus.READY].includes(status))
    // NOTE(review): the MOUNTED comparison cannot be true here, since `index` was found by matching READY
    if (index < 0 || this.statusList[index] === AudioStatus.MOUNTED) {
      return
    }
    const audioElement = this.audioList[index]
    if (audioElement instanceof SpeechSynthesisUtterance) {
      if (window.speechSynthesis.paused) {
        // NOTE(review): the resume path does not set the segment to PLAY_INT, so isPlaying() stays false
        window.speechSynthesis.resume()
      } else {
        if (window.speechSynthesis.pending) {
          window.speechSynthesis.cancel()
        }
        speechSynthesis.speak(audioElement)
        this.statusList[index] = AudioStatus.PLAY_INT
      }
    } else {
      // Playback through an <audio> element
      try {
        // NOTE(review): HTMLMediaElement.play() returns a promise; a rejection is not caught by this try/catch — confirm
        audioElement.play()
        this.statusList[index] = AudioStatus.PLAY_INT
      } catch (e) {
        this.statusList[index] = AudioStatus.ERROR
      }
    }
-  } else if (props.tts_type === 'BROWSER') {
+  }
-    if (audioList.value[currentAudioIndex.value] !== utterance.value?.text) {
+  pause(self?: boolean) {
-      window.speechSynthesis.cancel()
+    const index = this.statusList.findIndex((status) => status === AudioStatus.PLAY_INT)
-    }
+    if (index < 0) {
    if (
      window.speechSynthesis.paused &&
      audioList.value[currentAudioIndex.value] === utterance.value?.text
    ) {
      window.speechSynthesis.resume()
      return
    }
-    // 创建一个新的 SpeechSynthesisUtterance 实例
+    const audioElement = this.audioList[index]
-    utterance.value = new SpeechSynthesisUtterance(audioList.value[currentAudioIndex.value])
+    if (audioElement instanceof SpeechSynthesisUtterance) {
-    utterance.value.onend = () => {
+      this.statusList[index] = AudioStatus.READY
-      utterance.value = null
+      if (self) {
-      currentAudioIndex.value += 1
+        window.speechSynthesis.pause()
-      playAnswerTextPart()
+        nextTick(() => {
-    }
+          if (!window.speechSynthesis.paused) {
-    utterance.value.onerror = () => {
+            window.speechSynthesis.cancel()
      audioPlayerStatus.value = false
      utterance.value = null
    }
    // 调用浏览器的朗读功能
    window.speechSynthesis.speak(utterance.value)
  } else if (props.tts_type === 'TTS') {
    // 恢复上次暂停的播放
    if (audioPlayer.value && audioPlayer.value[currentAudioIndex.value]?.src) {
      audioPlayer.value[currentAudioIndex.value].play()
      return
    }
    applicationApi
      .postTextToSpeech(
        (props.applicationId as string) || (id as string),
        { text: audioList.value[currentAudioIndex.value] },
        loading
      )
      .then(async (res: any) => {
        if (res.type === 'application/json') {
          const text = await res.text()
          MsgError(text)
          return
        }
        // 假设我们有一个 MP3 文件的字节数组
        // 创建 Blob 对象
        const blob = new Blob([res], { type: 'audio/mp3' })
        // 创建对象 URL
        const url = URL.createObjectURL(blob)
        // 测试blob是否能正常播放
        // const link = document.createElement('a')
        // link.href = window.URL.createObjectURL(blob)
        // link.download = "abc.mp3"
        // link.click()
        // 检查 audioPlayer 是否已经引用了 DOM 元素
        if (audioPlayer.value) {
          audioPlayer.value[currentAudioIndex.value].src = url
          audioPlayer.value[currentAudioIndex.value].play() // 自动播放音频
          audioPlayer.value[currentAudioIndex.value].onended = () => {
            currentAudioIndex.value += 1
            playAnswerTextPart()
          }
-        } else {
+        })
-          console.error('audioPlayer.value is not an instance of HTMLAudioElement')
+      } else {
-        }
+        window.speechSynthesis.cancel()
-      })
+      }
-      .catch((err) => {
+    } else {
-        console.log('err: ', err)
+      if (this.statusList[index] === AudioStatus.PLAY_INT) {
-      })
+        // 标签朗读
-  }
+        this.statusList[index] = AudioStatus.READY
-}
+        audioElement.pause()
-
+      }
 const pausePlayAnswerText = () => {
  audioPlayerStatus.value = false
  if (props.tts_type === 'TTS') {
    if (audioPlayer.value) {
      audioPlayer.value?.forEach((item) => {
        item.pause()
      })
    }
  }
-  if (props.tts_type === 'BROWSER') {
  /**
   * Build the list of speech segments via smartSplit, using the minimum-length
   * thresholds 20 / 50 / 100.
   *
   * @param text answer text; it is cleaned below (form renderer removed,
   *   markdown converted to plain text)
   * @param is_end forwarded to smartSplit to control whether the unterminated
   *   tail is emitted
   */
+  getTextList(text: string, is_end: boolean) {
-    window.speechSynthesis.pause()
+    // Remove the form renderer
    text = removeFormRander(text)
    // Convert text to plain text
    text = markdownToPlainText(text)
    // NOTE(review): the cleaned `text` above is not used; smartSplit receives props.data.answer_text instead
    const split = smartSplit(
      props.data.answer_text,
      {
        0: 20,
        1: 50,
        5: 100
      },
      is_end
    )
    return split
  }
 }
-
+const audioManage = ref<AudioManage>()
 // Mount hook: creates the AudioManage instance for this component and subscribes
 // to the 'play:pause' and 'change:answer' bus events.
 onMounted(() => {
-  bus.on('pause-autoplay', () => {
+  if (audioCiontainer.value) {
-    pausePlayAnswerText()
+    audioManage.value = new AudioManage(props.tts_type, audioCiontainer.value)
    // console.log(1234)
  })
  bus.emit('pause-autoplay')
  // Autoplay right after the first answer; do not autoplay when opening history records
  if (
    props.tts &&
    props.tts_autoplay &&
    buttonData.value.write_ed &&
    !buttonData.value.update_time
  ) {
    playAnswerText(buttonData.value.answer_text)
  }
  // Another record started playing: pause this component's audio
  bus.on('play:pause', (record_id: string) => {
    if (record_id !== props.data.record_id) {
      if (audioManage.value) {
        audioManage.value?.pause()
      }
    }
  })
  // The answer text of a record changed
  bus.on('change:answer', (data: any) => {
    const record_id = data.record_id
    // Ask the components of all other records to pause
    bus.emit('play:pause', record_id)
    if (props.data.record_id == record_id) {
      // Only feed the new text to the player when TTS and autoplay are both enabled
      if (props.tts && props.tts_autoplay) {
        if (audioManage.value) {
          audioManage.value.play(props.data.answer_text, data.is_end)
        }
      }
    }
  })
 })
 </script>
 <style lang="scss" scoped>
--- a/ui/src/components/ai-chat/component/operation-button/index.vue
+++ b/ui/src/components/ai-chat/component/operation-button/index.vue
@ -26,7 +26,7 @@
    </div>
    <ChatOperationButton
-      v-if="chatRecord.write_ed && 500 != chatRecord.status"
+      v-show="chatRecord.write_ed && 500 != chatRecord.status"
      :tts="application.tts_model_enable"
      :tts_type="application.tts_type"
      :tts_autoplay="application.tts_autoplay"
--- a/ui/src/components/ai-chat/index.vue
+++ b/ui/src/components/ai-chat/index.vue
@ -490,6 +490,7 @@ const handleScroll = () => {
 }
 onMounted(() => {
  window.speechSynthesis.cancel()
  window.sendMessage = sendMessage
 })