From 4604594c559e815a3cfd324b3f525fa78defc8da Mon Sep 17 00:00:00 2001 From: vinodkiran Date: Tue, 30 Jan 2024 21:48:08 -0500 Subject: [PATCH] SpeechToText: Adding SpeechToText at the Chatflow level. --- .../credentials/AssemblyAI.credential.ts | 23 +++++++++ .../nodes/chatmodels/ChatOpenAI/ChatOpenAI.ts | 36 +------------- .../ChatOpenAI/FlowiseChatOpenAI.ts | 21 +------- .../speechtotext/assemblyai/AssemblyAI.ts | 33 +++++++++++++ .../speechtotext/assemblyai/assemblyai.png | Bin 0 -> 8677 bytes packages/components/src/MultiModalUtils.ts | 43 ---------------- packages/server/src/NodesPool.ts | 2 +- packages/server/src/index.ts | 46 +++++++++++++++--- packages/server/src/utils/index.ts | 34 ++++++++++++- .../ui-component/dialog/SpeechToTextDialog.js | 10 ++-- 10 files changed, 136 insertions(+), 112 deletions(-) create mode 100644 packages/components/credentials/AssemblyAI.credential.ts create mode 100644 packages/components/nodes/speechtotext/assemblyai/AssemblyAI.ts create mode 100644 packages/components/nodes/speechtotext/assemblyai/assemblyai.png diff --git a/packages/components/credentials/AssemblyAI.credential.ts b/packages/components/credentials/AssemblyAI.credential.ts new file mode 100644 index 00000000..019cd7aa --- /dev/null +++ b/packages/components/credentials/AssemblyAI.credential.ts @@ -0,0 +1,23 @@ +import { INodeParams, INodeCredential } from '../src/Interface' + +class AssemblyAIApi implements INodeCredential { + label: string + name: string + version: number + inputs: INodeParams[] + + constructor() { + this.label = 'AssemblyAI API' + this.name = 'assemblyAIApi' + this.version = 1.0 + this.inputs = [ + { + label: 'AssemblyAI Api Key', + name: 'assemblyAIApiKey', + type: 'password' + } + ] + } +} + +module.exports = { credClass: AssemblyAIApi } diff --git a/packages/components/nodes/chatmodels/ChatOpenAI/ChatOpenAI.ts b/packages/components/nodes/chatmodels/ChatOpenAI/ChatOpenAI.ts index 9543f1ee..1cb09f3f 100644 --- a/packages/components/nodes/chatmodels/ChatOpenAI/ChatOpenAI.ts +++ b/packages/components/nodes/chatmodels/ChatOpenAI/ChatOpenAI.ts @@ -162,36 +162,6 @@ class ChatOpenAI_ChatModels implements INode { default: false, optional: true }, - { - label: 'Allow Speech to Text', - name: 'allowSpeechToText', - type: 'boolean', - default: false, - optional: true - }, - // TODO: only show when speechToText is true - { - label: 'Speech to Text Method', - description: 'How to turn audio into text', - name: 'speechToTextMode', - type: 'options', - options: [ - { - label: 'Transcriptions', - name: 'transcriptions', - description: - 'Transcribe audio into whatever language the audio is in. Default method when Speech to Text is turned on.' - }, - { - label: 'Translations', - name: 'translations', - description: 'Translate and transcribe the audio into english.' - } - ], - optional: false, - default: 'transcriptions', - additionalParams: true - }, { label: 'Image Resolution', description: 'This parameter controls the resolution in which the model views the image.', @@ -231,8 +201,6 @@ class ChatOpenAI_ChatModels implements INode { const baseOptions = nodeData.inputs?.baseOptions const allowImageUploads = nodeData.inputs?.allowImageUploads as boolean - const allowSpeechToText = nodeData.inputs?.allowSpeechToText as boolean - const speechToTextMode = nodeData.inputs?.speechToTextMode as string const imageResolution = nodeData.inputs?.imageResolution as string const credentialData = await getCredentialData(nodeData.credential ?? '', options) @@ -270,9 +238,7 @@ class ChatOpenAI_ChatModels implements INode { const multiModal = { allowImageUploads: allowImageUploads ?? false, - allowSpeechToText: allowSpeechToText ?? false, - imageResolution, - speechToTextMode + imageResolution } model.multiModal = multiModal return model diff --git a/packages/components/nodes/chatmodels/ChatOpenAI/FlowiseChatOpenAI.ts b/packages/components/nodes/chatmodels/ChatOpenAI/FlowiseChatOpenAI.ts index 8af9c4df..1bf4a286 100644 --- a/packages/components/nodes/chatmodels/ChatOpenAI/FlowiseChatOpenAI.ts +++ b/packages/components/nodes/chatmodels/ChatOpenAI/FlowiseChatOpenAI.ts @@ -7,8 +7,7 @@ import { ChatOpenAICallOptions } from '@langchain/openai/dist/chat_models' import { BaseMessageChunk, BaseMessageLike, HumanMessage, LLMResult } from 'langchain/schema' import { Callbacks } from '@langchain/core/callbacks/manager' import { ICommonObject, INodeData } from '../../../src' -import { addImagesToMessages, checkSpeechToText } from '../../../src/MultiModalUtils' -import { ChatPromptTemplate, PromptTemplate } from 'langchain/prompts' +import { addImagesToMessages } from '../../../src/MultiModalUtils' export class FlowiseChatOpenAI extends ChatOpenAI { multiModal: {} @@ -38,24 +37,6 @@ export class FlowiseChatOpenAI extends ChatOpenAI { private async injectMultiModalMessages(messages: BaseMessageLike[][]) { const nodeData = FlowiseChatOpenAI.chainNodeData const optionsData = FlowiseChatOpenAI.chainNodeOptions - let audioTrans = await checkSpeechToText(nodeData, optionsData) - if (audioTrans) { - if (messages.length > 0) { - const lastMessage = messages[0].pop() as HumanMessage - if (!nodeData.inputs?.prompt) { - lastMessage.content = audioTrans - } else if (nodeData.inputs?.prompt instanceof ChatPromptTemplate) { - lastMessage.content = audioTrans - } else if (nodeData.inputs?.prompt instanceof PromptTemplate) { - let prompt = nodeData.inputs?.prompt as PromptTemplate - let inputVar = prompt.inputVariables[0] - let formattedValues: any = {} - formattedValues[inputVar] = audioTrans - lastMessage.content = await prompt.format(formattedValues) - } - messages[0].push(lastMessage) - } - } const messageContent = addImagesToMessages(nodeData, optionsData) if (messageContent) { if (messages[0].length > 0 && messages[0][messages[0].length - 1] instanceof HumanMessage) { diff --git a/packages/components/nodes/speechtotext/assemblyai/AssemblyAI.ts b/packages/components/nodes/speechtotext/assemblyai/AssemblyAI.ts new file mode 100644 index 00000000..c5db6619 --- /dev/null +++ b/packages/components/nodes/speechtotext/assemblyai/AssemblyAI.ts @@ -0,0 +1,33 @@ +import { INode, INodeParams } from '../../../src/Interface' + +class AssemblyAI_SpeechToText implements INode { + label: string + name: string + version: number + description: string + type: string + icon: string + category: string + baseClasses: string[] + inputs?: INodeParams[] + credential: INodeParams + + constructor() { + this.label = 'AssemblyAI' + this.name = 'assemblyAI' + this.version = 1.0 + this.type = 'AssemblyAI' + this.icon = 'assemblyai.png' + this.category = 'SpeechToText' + this.baseClasses = [this.type] + this.inputs = [] + this.credential = { + label: 'Connect Credential', + name: 'credential', + type: 'credential', + credentialNames: ['assemblyAIApi'] + } + } +} + +module.exports = { nodeClass: AssemblyAI_SpeechToText } diff --git a/packages/components/nodes/speechtotext/assemblyai/assemblyai.png b/packages/components/nodes/speechtotext/assemblyai/assemblyai.png new file mode 100644 index 0000000000000000000000000000000000000000..8919cb18b9c68087fae8cdba5722d44629ab49e1 GIT binary patch literal 8677 zcmV5gcHx>W@019+cPE!C%8;-4V+}ppo+sgS9001FKNklhX~w6IoVB?7pFa3O{$RVFaXcxm+%n%jI&pTrQW(<#M@PE|<$y zv)Tjd8rR>bYumm*pTAi@d6pIC!Rs;NM`cod)FIk&!7PimPrs5sepz8 z!Ez*T;y8gtLV*tx2N&g!j09f~IysWpaIiVJ@FL8islXf{B8ROn&ys?=Gvh#da7D2N zEIi=~U4R^0SqP$BR@caSH*(@k!QQbL$iP@MEk3HEW;m|TI50UT&J0Zc1ZEUM0dPAMS=i{ zuGvbkZgz(nm> zRqZWh<#zAkLs`XScs53iM|>a>m%{t%A#(} zFqUB}Rv($asn@>I4Xh@wY#! z)in*mQ+UKJ!Y$S_%}8(}dFL&ihjsIArn^5@HI3Y?w5Fg4$i7N^7gL1bkQC-fkh%I< z1ZNOaRCuMbd#tcUhPEA&O7(Y)LoDk;PfQ|^NO#z0!VEz0VAcM3*u|GLh2lLHe;9YY zR)|O*eht4EWPF9lFWL85oB5&50RnlYyxtvBeXQRNQl@eZ=7z=7U}9j|ND%V-6V>yU zOacK#DoZW?+Jv~SdaMA8)c!NA&-jieJKL>aylX8G@D2hZTK00X@)igZWCo$p87DS|POA4qxF#OSgR z?8+UXW{Z5{MEIsA`(?^T0URWItVb#}Uud;)uSOZ|k_zU=X@rDBa)C3d?tF*1Ura=t z(`A>X&maH+5}!uunvq)*bT4x9=)NB#{ef55g{4?N8gcs$F zr5dr0A+1qs=iA038dII*o+VOUD}m4?(nr;u?+}k_ME9%qEbAT%Hqj2L+al60&~zBYQ0) z&yD=3?J4);FpcqUlV2Rw;S#_oCrHK-sNPurjDMlzvP%MC{-PHQz=EfIQLi}G+Y|xg z5q}^^ZtI&Y#K=an+ixLMtwjc)IPQ>nO!7Uwd`{PIA+zeWNQV$O?hqdIt&`op$=ZIu z<)zNf_a8ZDwA?lbL{UpYQEJTujr#xpTE|2_*{6pv!(wfI=EI+sY`lU=C}y6^>B zf2Yvjg;<4jocWI59AfHDQvt{AasL>oH*$+^PS)UhNt~Q z)J+fwnHKd$@7uSw+I#IX^PROqn3dNQX*dY+afOB#_Wlifbe8$v1Y(E~Sti&HJcC&k zk1QQ|$XYKn`pkR-G$6t}2gB$Cfhb>huBY!Kja^}nwm#l}hfz@}(n!QZTnAw@vZG+K z=&j+;YwF!fgq#-|%riL9l;a@yx^qtrW{20)`zvVe0=W%GYZV@EtL;bxipD|kb=S-N zZ?g56`A(13N_aaHW!TVy0{f49KOB=kK5I=JG3ihubBrj&YQ)s!<-1Vv9E5nwE5y5F zfQ_xy%y-&R(CPgMJx#d>KaN0b>>h%r`=_nDwe^tHv3GEgG^|9-bz9>gd5MVgyxP(4 zeRl<|F@ZoPRwHOYqKdjv#qxsG*FBb1vQuyQi|a`s4^P(`QEuT*BUHl0kHM`js)mR3 zX6x}0pKw7gbZAED!>eL>sjqujPSZ$ZX9cYZ2LZ`ZB}KyEJ;NjXzCsp#JERcC)}TaY zr4U&{VQQ<7ya&jPgK&M_qayDl*P*Qs(7@N32ruDj*_1d4yTgo!z@xvKZ-_!6!r%)V z7%B5+C31ko-5d5Kkuef1RsxFA2_p?rg-bTHo3m<39+~pu)?(%x;$hfjiz&@cZo%?V zmik&6_LM_E1+7s)fC35;>F-hF?<0_0yqo1Jd#|ic3R+V%ft?t4P)#`q0?Ed3XzTD$ zts{xdzpPhUHMYfQ6cWmWnOm>Oullm};fMw^++93)rk)9>dCLpc%gaarg85i+sr{Pi@@76yI8=&-2^PQmKn~0xnnOz71 zdEVCTzVPwv$JXZ^hqrLqmp(l#y)bMcOXpb_$h8Yw9})|2_-^7Pz|!Ke^kRwR>%E10 z^{oYogt&r((Iu1rlMC@|=IeXNwU?SNWvm1yqY?Frp5ZBhaEx)`VH85Ne~2+85WvWU z=3--fa>*>aA5b!q_G`W)>s6X$*Q^|saCtJ`%@GF<0jZ;l9b~@Kx|0wrm^~eQKF?>q zcJ=Be^PM9oKvNO(<>`Qhwe!LZYwxyKxAkG&d908*9awsuL|lzro0mvfv2^7mC*zEd z{XpSg6Y1iz$oPYXb`ls>3Zy*qRd>O(nO5_e@AO!!tdr!UCg=5QzDPyVD2?rvGv6Bq zG=h_$kGxLma$Z~W1;xR4aa*tXPHR+6c*uG(=RIlbIJrLBTkIv&RZ`Ldv8*F zq}_AAAb^Nky$(&p;;0e`nJ)?fiMU49cFh-o7#_F=aF{p(A?H1B>x(pzvlW!>C2c*d z$O9^(DGe|=6v?VPQXOy34QSL}q{)I-Py!L_*WUSBv7R#j4`0jYS$B;znE7Jq^oc@_h{Zvjw_YMp zxD6y)XLjq`OMP)YzyTcUwF~Cx1vziMt)md@@XOw}RnQuxo$C^c`>Xexw)FbUcM(l< zXsn<$*%XV>?oA4qD1p>xzJxt{(q7H?A9rWFr8W)(QLHhLILtY)Vc!2$mq?@BbT^G` z@P|Z8STeh3e^7f>C7BR1uOdY!Er!Rfp9@{|KnLX_v4*#@_3muTk^vqCJ^al3`j;4= za*_X9OTN8TLVG))kcoX#j`O*j^GzV7#(SOl1|JY+NI>VMuCD-6{uOeq2}{?T{8%Y~ z$PyyJOR150XR~$RMT9DajYu=LemZ)=Z{JCoH?@&?FlE5vkGlnGpa@o$kfHqptuBKhAVHmp zg{9Ao2#!uUf6L3zbn@+y01++>lxyNLbDeifR8TtDy+|6He=HKtc)F71QV}5jneX82 z*jr7$H$a3W0+FXO@Ip&LCw7GpVi;-sK#N=jA^GkpRQlaQbx=8rJDarH{R^u0=SxBZ zLJ(nsw=bD_v2}k>$o#J=ZQA-VQ^9BmVO}ndvF0!1Aoud`DBeUyWSqtJ^kzlNn_>lAXtj*p;8Yly^SETsuu9i_m`Hh_?PVFW3h zHEU=q`R>ud;PE{Z(#YFY0kLPkz>!{tkp}<9b@VWd;3;<9Vq7KjQU~|9WA8B1;F<6K za;tNExvJP3bG|}fAoE&Vx1u~`WN66NkNLBAlyDX(f6sW_En7iIq5Z>T-`0|Em-v7X z)08-BPtTD0f|d>+mZM{!~zzHvx4{r1=nRnp|9w8nh@zQ{; zhkrhOVIv+dFk<@(nHNMHA-vytpoXgd_)dSxg_omIx!FzpO6c1&Utabbzs%xIy|x}6 z>A)a8#O?|ivGk?uP)@!6^TVOmP)_SVIO&BlUCVLvN9|FU-n`7pd3x)pGK@N`I33_M1jFO zgdfoCJWi3EJ zq|68gqD~oMq!vWTyjm7C5h2W)iqvFfK&-9%-)Z#riMac)xvh7O97+-?0%RCqR_QoQWxdP`A}7a2k{VvxqM=iqZifh#?n|OQTL( z)lYw=N>eXy*`3I+UGfEx-BtxKSrd0`-9rRbJU)!uZ))qoQ3DbbCEVXnTPs^S z$=7FOT-r*N-r6wv?h_!tM^JLM6E5_10Ra)uM}GOw+i6mvEfFE}mZh-ADVC6R0Ri;s zyF%wzGczQv%^yR(plwp+V= zM1DPR5u*iCY`HlM(LIm@Ns78Ko+|((eTgwKa(YMy7l)Bd$|knn=hbRr@;$qIl+Cud zU1^nKNX?@p^^I)(8;Us4MvG#r-)&Y0VwS^5l4G=+Z15;|fBB7{Q&S{4OJ~_CEOMZA z@)^lfGA=XpTx%lkpZPkB_(tz@vT8;NfAxqHG>LeW?4+{H%b9P%(D8Ar0nS_M0WI_|cv;GDr1p6Y_e zmdUphT9CJBOwdJD4d=Wj`I5Ebotv`!%mpVaBGP9-BqtZ0d7&mwYt6i=I#wyfD8fSK z+A#SBse>g2LZr{9H)^zW&U|x&5?2gAbEWIz$rno(gv5*0MH1rgQYy&2AVL)|(VxEk z*8GaeH|h{eXks!Kn-VyQJ+o33AnAL)qB*Kq_aJkLHx^I60Kz7O33pZj$qB4fl6gtK z^7BU(pAs%WvTpJ{LP(z370d9#H8!xRA@imR#4MuLXk931S;f{nB!G&Hh(rfP*r{kn z>d3qx0wwX{ySg@XJcJ#}qKJ@rO`(bOb*Tk+^3>~Cg{6ycXt8UD5=Ru5khKsAtW!vU zCVq=u;Zc*N=SR7Q5T;J)CsqXGg_AFq?h|6pR3O7~nmRIXs_w*IBu`LgoG$EA)=9ou z0=iJaYlN-T(@+^8yIua&^9UhD8SoLNPV!v=kgN*oC`!~jfKyB6{nyrU5adugjb&wV zTkn++>=6Yenifo6uVuAl-c-RZRwvXLi3yIfysZZx5VR55*Pe8VJX}xa{YK=38azQP z>`KtU)?ryhdaZ?=0t9a;QuNNTB1`AY7cr2;VWhM}S=QD8#9z61!*XIj;I=Y|+lyEhw)RuWub)1_3NE#U?)Ix}GjCFMwaeN+57-)56-c%>SN9?>9 zxGB|7@tpZ8WQm|z%PyZMXEd!0lH?v6HSR$J!Hwyqe# z;8BKw-N|~k4j@Smzoe1eC8Xxeo9b{H&}3j@wLm9Z%O+ne{W^gX7A~}bmr4Oi$(QDB z5-k<*vkoFh2pOjZQ@tapHuDNwSBl6Ps&HM`);qA6N(r0@rK+^_n0(Pfwu}KYEFXnn zWzjR=K7Ib*n2ewV8_|-)R?T-C%E=cp@OMiH(yW3=xG4Y<*U6M{tsam+$=6|o0kc=4 zFkz36tPWs;j@aH`F`*m^b*qX_2Cg*6IR7PfWPt0W7Qppjpek6JTtsslQh zCJbJ4f?6;_@|`vFURkjOAxJSslj<`sh-5}AC(5@KG0a)J=11hnf|X{_;j7^*Q#l~J zAf#|e0TM=NVJ%z2*1IcWuu3rt14)X$XE9AgQv_8yT zjT*4@zh}Ou0wHMOp0Qvz-JF0R!oRwx1k5(kvy?V6uS&kS$=x$9?#1~deSze=k9?#~ zgA!xz>E&xu)$otCb1VXmnH{H!mOE>rc_}N3mBR-Xz8)c9>j0rqU@aaK4#3k zsZawwPy)RDzzNEr1dVOHeP(iPAoHd=Xh6tZTqh87`Y{V2=zxjXflr?qTA)`OnOE34fC&HK zumn;{-;M}#WM0@1B@-}v6T^l+q;Degrph8f2ya9Y=;PYQ%$XM+P{VV0;T~-3o3V7x zd>ukiWI_uwWM0Z)1X*+=4g*XZnK#?ItN~6;?i$C+$BdS4cZNm^leU=WOG|)!pZUfY z9cTt_$F{@EmhLzRix{J212@T+W-_lv#6~Tggjdm7UOw|xkrxG6uO6RsDQ9WnoYy8_ z?ctyc(L+|kQkhq>K!2pnwJA%_wmt|= zOzvVt3BwBG$7`AQYrKP?Q0n@urM1jU@)b4M8$pP*8M(VWykP0*;fewkD1ipN*I?!a z5#QA_CzcQ{f|-|+Z|G8SUsBId58b6)W%_m$Fmou0S#4jGCC7K~R2 zNxi+ys}XTo@lUk!zb3Z#;pxnq5P_3%g(m3HWadrP6~u50sTkt5dy3R3M3SMrZdp>e zq~6>ymauidGy|}LX3}z=Oul>Q`1@k;g|UP*4hV=iRbT|gn0L<|VJLhV{EZ2dJ>(5RaV$gMOZ z%+RN-O8b_clJDo~)?Zk$R2`;gUYg z0WsqEdQGzRF6xm!c$$zw%uCg{rT=T|pR-94CS~4lJRC|49hQ^#$-E@rPo!YB1|lJ; za8C=7wS*+^4G52NKj-4)FbR;%2Hn@@(JuZyEIlsc5Hb@d-6TLVBX{(`M9?Lt;rF!k zScfEmV&ro{M993<6-3a*-J8J4+X3U(yEl4z%>T z#qBb--hD|zNL&sj3<&Y{W|p3k?`Ldzh%aVOuoA{wIwE+93rds_f}gLq$h;r|4BA3? zduBueT8LTJ1WHunf=W@BHv#09d<`OC0Wq-RX@hW+01*a~E*>ZPfeE*7W9k2sYX-ZG z!!Qty?S%^i%|)pH|5vv)S9l;9N^-ZCQX?moob=+s^NiKj=4(R4)4~K}?A!BBtSdM4 z9B;au;L({Q(l6~bGm1aJf{Km|{Yc{(qM;D8Ddrj3c$IvVLQ zCQRczAyIz!pKh&I?2s8l1&W(Mg_&)=&Rg1caOvf(LrOA0YV$P*Gp@WL#DqS}eIU}y zASBom=SikWm*xu@))*OpVKRh?>SaqC5V51Qjerq(QF!%8vyEl`+kx>iQv5c3s}~<_ zK)fA&TzK=O{NjICY;9-->481wTiOl{&D!a_Vn=DE2#`e2@F_d5NZpQ@svt4QsG z9d63PmtH7}nolV;J{ST_KSyu{VlwUXs^)tUA_pTjBKQvD2mnz=(6+|wui1Od*Lg*o z@gnABa0Jcwtz5zrF^1KUGrTf(-b-t(RU9Mtu&{*%d%lZ!!N7|+Pd;K>68-76iO4u7 zEI1m7{3)mKA@6v>;+B#|`j?gq3wZ^s+#DbRhqz$}D$P3;@sYRl*5=E##FYOS`|C)@ z4U;8UGm<7B$pNX&7e+t@UhWaIyo~g-RJkk>mErD{w)58J%Qe@ie3O>Ld5#k?C79Aw z>ds573m34$3p&Pst~SyI5EwC(V2Y?TIZ51k|LLimuuy#CzP0&Cj~MZtt7}K@NOy?H zOux9DjS;fjNEa_cMi`>f;Uwceua0#CfLV?@fq6?D>1VY9UwoR>_c;Oiign1~2TS|n z4i~Vr`8yUSXd|2(1U#I~@=)jyAxvZ_GMM+XWL0-806BSyf;4GyQc`zb5g7s)ub9Cy z#fT?nF?TFevj78F`kYMBc|{vm00_*O+T}5F{dlu?tP>t=v@`8-BA+DhyqDIeg2LmV zWH90z{M_9gOY=d1_%vZ?PRcBiE{N#+4uDu@eE;(EPp#^y5Klk?cj_k{PU`bx`bd{p zM;Z_Z4JTN5dbnrd1}3Ov*L=jBlnJ)6(C!;9fLKJ#2^Re7!9Er`A%T^ntTA^3SaZ)?h1f7>ycVwt@ zePS?*5P2(JYJX8ZFLp-yecsp2_q|hc@yD&v##4yZ={ax;e43;4ijKHVjf-E|$#jHH z4whX?CfLS8OJD`D!OXIPI76nl#l5S`$rRgIXdfrQo5fMJU`TM0ZKj`%G?}aOg2rdG{hKia;B9h6Gb}BcU12Uc-}E&C_|XB_L^d!w~Gkk za^E22t@LG0@SMyEh&10H5r&jeaf8AOqdE3@b*%rOjyr}#d-5dyY@PRS`(~8lFbqRM z%Z_7wfZqSA3u}sq$w;zukdF*cPmbHczD&q`R#Xpx1Co*~5OOImuW&o>AiE3M z?y8ugq6d_Cm43eYn$IB?^x`+(&O6BKg&Z$-- { - const MODEL_NAME = 'whisper-1' - let input = undefined - let model = nodeData.inputs?.model as BaseChatModel - if (model instanceof ChatOpenAI && (model as any).multiModal) { - const multiModalConfig = (model as any).multiModal - if (options?.uploads) { - if (options.uploads.length === 1 && options.uploads[0].mime === 'audio/webm') { - const upload = options.uploads[0] - //special case, text input is empty, but we have an upload (recorded audio) - if (multiModalConfig.allowSpeechToText) { - const openAIClientOptions: ClientOptions = { - apiKey: model.openAIApiKey, - organization: model.organization - } - const openAIClient = new OpenAIClient(openAIClientOptions) - const filePath = path.join(getUserHome(), '.flowise', 'gptvision', upload.data, upload.name) - - // as the image is stored in the server, read the file and convert it to base64 - const audio_file = fs.createReadStream(filePath) - - if (multiModalConfig.speechToTextMode === 'transcriptions') { - const transcription = await openAIClient.audio.transcriptions.create({ - file: audio_file, - model: MODEL_NAME - }) - return transcription.text - } else if (multiModalConfig.speechToTextMode === 'translations') { - const translation = await openAIClient.audio.translations.create({ - file: audio_file, - model: MODEL_NAME - }) - return translation.text - } - } else { - throw new Error('Speech to text is not selected, but found a recorded audio file. Please fix the chain.') - } - } - } - } - return input -} - export const addImagesToMessages = (nodeData: INodeData, options: ICommonObject): MessageContent => { const imageContent: MessageContent = [] let model = nodeData.inputs?.model as BaseChatModel diff --git a/packages/server/src/NodesPool.ts b/packages/server/src/NodesPool.ts index f4681d4a..8b01e63a 100644 --- a/packages/server/src/NodesPool.ts +++ b/packages/server/src/NodesPool.ts @@ -54,7 +54,7 @@ export class NodesPool { } } - const skipCategories = ['Analytic'] + const skipCategories = ['Analytic', 'SpeechToText'] if (!skipCategories.includes(newNodeInstance.category)) { this.componentNodes[newNodeInstance.name] = newNodeInstance } diff --git a/packages/server/src/index.ts b/packages/server/src/index.ts index e7816311..7558c689 100644 --- a/packages/server/src/index.ts +++ b/packages/server/src/index.ts @@ -46,7 +46,8 @@ import { getSessionChatHistory, getAllConnectedNodes, clearSessionMemory, - findMemoryNode + findMemoryNode, + convertedSpeechToText } from './utils' import { cloneDeep, omit, uniqWith, isEqual } from 'lodash' import { getDataSource } from './DataSource' @@ -58,7 +59,7 @@ import { Tool } from './database/entities/Tool' import { Assistant } from './database/entities/Assistant' import { ChatflowPool } from './ChatflowPool' import { CachePool } from './CachePool' -import { ICommonObject, IMessage, INodeOptionsValue, INodeParams, handleEscapeCharacters } from 'flowise-components' +import { ICommonObject, IMessage, INodeOptionsValue, INodeParams, handleEscapeCharacters, IFileUpload } from 'flowise-components' import { createRateLimiter, getRateLimiter, initializeRateLimiter } from './utils/rateLimit' import { addAPIKey, compareKeys, deleteAPIKey, getApiKey, getAPIKeys, updateAPIKey } from './utils/apiKey' import { sanitizeMiddleware } from './utils/XSS' @@ -473,6 +474,17 @@ export class App { const flowObj = JSON.parse(chatflow.flowData) const allowances: IUploadFileSizeAndTypes[] = [] let allowSpeechToText = false + if (chatflow.speechToText) { + const speechToTextProviders = JSON.parse(chatflow.speechToText) + for (const provider in speechToTextProviders) { + const providerObj = speechToTextProviders[provider] + if (providerObj.status) { + allowSpeechToText = true + break + } + } + } + let allowImageUploads = false flowObj.nodes.forEach((node: IReactFlowNode) => { if (uploadAllowedCategoryNodes.indexOf(node.data.category) > -1) { @@ -488,9 +500,6 @@ export class App { }) allowImageUploads = true } - if (param.name === 'allowSpeechToText' && node.data.inputs?.['allowSpeechToText']) { - allowSpeechToText = true - } }) } }) @@ -1602,7 +1611,8 @@ export class App { if (incomingInput.uploads) { // @ts-ignore - ;(incomingInput.uploads as any[]).forEach((upload: any) => { + const uploads = incomingInput.uploads as IFileUpload[] + for (const upload of uploads) { if (upload.type === 'file' || upload.type === 'audio') { const filename = upload.name const dir = path.join(getUserHome(), '.flowise', 'gptvision', chatId) @@ -1618,7 +1628,29 @@ export class App { upload.data = chatId upload.type = 'stored-file' } - }) + + if (upload.mime === 'audio/webm' && incomingInput.uploads?.length === 1) { + //speechToText + let speechToTextConfig: any = {} + if (chatflow.speechToText) { + const speechToTextProviders = JSON.parse(chatflow.speechToText) + for (const provider in speechToTextProviders) { + const providerObj = speechToTextProviders[provider] + if (providerObj.status) { + speechToTextConfig = providerObj + speechToTextConfig['name'] = provider + break + } + } + } + if (speechToTextConfig) { + const speechToTextResult = await convertedSpeechToText(upload.data, speechToTextConfig) + if (speechToTextResult) { + incomingInput.question = speechToTextResult + } + } + } + } } let isStreamValid = false diff --git a/packages/server/src/utils/index.ts b/packages/server/src/utils/index.ts index dafe612c..92f4d450 100644 --- a/packages/server/src/utils/index.ts +++ b/packages/server/src/utils/index.ts @@ -593,7 +593,6 @@ export const resolveVariables = ( } const paramsObj = flowNodeData[types] ?? {} - getParamValues(paramsObj) return flowNodeData @@ -1079,3 +1078,36 @@ export const getAllValuesFromJson = (obj: any): any[] => { extractValues(obj) return values } + +export const convertedSpeechToText = async (upload: any, speechToTextConfig: any) => { + // const MODEL_NAME = 'whisper-1' + if (speechToTextConfig) { + //special case, text input is empty, but we have an upload (recorded audio) + // const openAIClientOptions: ClientOptions = { + // apiKey: model.openAIApiKey, + // organization: model.organization + // } + // const openAIClient = new OpenAIClient(openAIClientOptions) + // const filePath = path.join(getUserHome(), '.flowise', 'gptvision', upload.data, upload.name) + // + // // as the image is stored in the server, read the file and convert it to base64 + // const audio_file = fs.createReadStream(filePath) + // + // if (multiModalConfig.speechToTextMode === 'transcriptions') { + // const transcription = await openAIClient.audio.transcriptions.create({ + // file: audio_file, + // model: MODEL_NAME + // }) + // return transcription.text + // } else if (multiModalConfig.speechToTextMode === 'translations') { + // const translation = await openAIClient.audio.translations.create({ + // file: audio_file, + // model: MODEL_NAME + // }) + // return translation.text + // } + } else { + throw new Error('Speech to text is not selected, but found a recorded audio file. Please fix the chain.') + } + return undefined +} diff --git a/packages/ui/src/ui-component/dialog/SpeechToTextDialog.js b/packages/ui/src/ui-component/dialog/SpeechToTextDialog.js index fa2b7a78..10b6f076 100644 --- a/packages/ui/src/ui-component/dialog/SpeechToTextDialog.js +++ b/packages/ui/src/ui-component/dialog/SpeechToTextDialog.js @@ -41,8 +41,8 @@ import chatflowsApi from 'api/chatflows' const speechToTextProviders = [ { - label: 'OpenAI Wisper', - name: 'openAIWisper', + label: 'OpenAI Whisper', + name: 'openAIWhisper', icon: openAISVG, url: 'https://platform.openai.com/docs/guides/speech-to-text', inputs: [ @@ -70,7 +70,7 @@ const speechToTextProviders = [ label: 'Connect Credential', name: 'credential', type: 'credential', - credentialNames: ['assemblyAiApi'] + credentialNames: ['assemblyAIApi'] }, { label: 'On/Off', @@ -101,7 +101,7 @@ const SpeechToTextDialog = ({ show, dialogProps, onCancel }) => { }) if (saveResp.data) { enqueueSnackbar({ - message: 'Analytic Configuration Saved', + message: 'Speech To Text Configuration Saved', options: { key: new Date().getTime() + Math.random(), variant: 'success', @@ -118,7 +118,7 @@ const SpeechToTextDialog = ({ show, dialogProps, onCancel }) => { } catch (error) { const errorData = error.response.data || `${error.response.status}: ${error.response.statusText}` enqueueSnackbar({ - message: `Failed to save Analytic Configuration: ${errorData}`, + message: `Failed to save Speech To Text Configuration: ${errorData}`, options: { key: new Date().getTime() + Math.random(), variant: 'error',