// Flowise/packages/components/src/speechToText.ts

import { ICommonObject, IFileUpload } from './Interface'
import { getCredentialData, getStoragePath } from './utils'
import { type ClientOptions, OpenAIClient } from '@langchain/openai'
import fs from 'fs'
import path from 'path'
import { AssemblyAI } from 'assemblyai'

/**
 * Transcribes an uploaded audio file with the configured speech-to-text provider.
 *
 * The audio file is read from
 * `<storage path>/<options.chatflowid>/<options.chatId>/<upload.name>`.
 * Supported values of `speechToTextConfig.name` are `'openAIWhisper'` and
 * `'assemblyAiTranscribe'`.
 *
 * @param upload - Descriptor of the uploaded audio file; only `upload.name` is read here.
 * @param speechToTextConfig - Provider selection (`name`), `credentialId` and, for Whisper,
 * the optional `language`, `temperature` and `prompt` settings.
 * @param options - Must carry `chatflowid` and `chatId`; also forwarded to `getCredentialData`.
 * @returns The transcribed text, or `undefined` when the provider is not one of the supported
 * names or the provider returned no (or empty) text.
 * @throws Error when `speechToTextConfig` is missing, or when the resolved audio path would
 * fall outside the storage root.
 */
export const convertSpeechToText = async (
    upload: IFileUpload,
    speechToTextConfig: ICommonObject,
    options: ICommonObject
): Promise<string | undefined> => {
    if (!speechToTextConfig) {
        throw new Error('Speech to text is not selected, but found a recorded audio file. Please fix the chain.')
    }

    const credentialId = speechToTextConfig.credentialId as string
    const credentialData = await getCredentialData(credentialId ?? '', options)

    const storageRoot = getStoragePath()
    const filePath = path.join(storageRoot, options.chatflowid, options.chatId, upload.name)

    // The path segments originate from the request; refuse anything that uses `..` (or an
    // absolute segment on another drive) to climb out of the storage root.
    const relativeToRoot = path.relative(path.resolve(storageRoot), path.resolve(filePath))
    if (relativeToRoot === '..' || relativeToRoot.startsWith(`..${path.sep}`) || path.isAbsolute(relativeToRoot)) {
        throw new Error(`Invalid audio file path: ${upload.name}`)
    }

    const provider = speechToTextConfig.name
    if (provider !== 'openAIWhisper' && provider !== 'assemblyAiTranscribe') {
        // Unknown provider: return before opening the file so no read stream is left dangling.
        return undefined
    }

    const audioFile = fs.createReadStream(filePath)
    try {
        if (provider === 'openAIWhisper') {
            const openAIClientOptions: ClientOptions = {
                apiKey: credentialData.openAIApiKey
            }
            const openAIClient = new OpenAIClient(openAIClientOptions)

            const transcription = await openAIClient.audio.transcriptions.create({
                file: audioFile,
                model: 'whisper-1',
                language: speechToTextConfig.language,
                temperature: speechToTextConfig.temperature ? parseFloat(speechToTextConfig.temperature) : undefined,
                prompt: speechToTextConfig.prompt
            })
            // Empty text is reported as `undefined`, same as "no transcription".
            return transcription?.text || undefined
        }

        const client = new AssemblyAI({
            apiKey: credentialData.assemblyAIApiKey
        })

        const transcription = await client.transcripts.transcribe({
            audio: audioFile,
            speaker_labels: false
        })
        return transcription?.text || undefined
    } finally {
        // Release the file descriptor whether the provider call succeeded or threw.
        audioFile.destroy()
    }
}