Gemini with audio
Video
Code
Mix.install([
{:req, "~> 0.4.14"},
{:kino, "~> 0.12.0"}
])
# Fields shown in the form: a free-text prompt plus an audio input that
# delivers its recording as a WAV file.
fields = [
  prompt: Kino.Input.textarea("Prompt"),
  audio: Kino.Input.audio("Audio", format: :wav)
]

# Values are only delivered to listeners when "Submit" is pressed.
form = Kino.Control.form(fields, submit: "Submit")
# Frame that receives the model's answer as it streams in.
frame = Kino.Frame.new()

# Handle each form submission: clear the previous answer, then append every
# streamed text chunk to the frame as it arrives.
Kino.listen(form, fn
  # The audio input's value is nil when nothing was recorded or uploaded.
  # Show a hint instead of crashing the listener on a failed match.
  %{data: %{audio: nil}} ->
    Kino.Frame.render(
      frame,
      Kino.Text.new("Please record or upload audio before submitting.")
    )

  %{data: %{prompt: prompt, audio: %{file_ref: file_ref}}} ->
    Kino.Frame.clear(frame)

    # Resolve the uploaded audio to a path on disk that Gemini.chat_streaming/2 can read.
    file_path = Kino.Input.file_path(file_ref)

    prompt
    |> Gemini.chat_streaming(file_path)
    |> Stream.each(&Kino.Frame.append(frame, Kino.Text.new(&1)))
    |> Stream.run()
end)

Kino.Layout.grid([form, frame])
defmodule Gemini do
  @moduledoc """
  Minimal streaming client for the Gemini `streamGenerateContent` endpoint.

  Sends a text prompt together with a WAV audio file and exposes the
  server-sent-event response as a lazy stream of text fragments.
  """

  # How long the consumer waits for the next message from the request task
  # before it gives up and ends the stream.
  @idle_timeout_ms 15_000

  @doc """
  Streams the model's answer to `prompt` about the audio file at `file_path`.

  The file is read and Base64-encoded immediately and sent inline as
  `audio/wav`. The HTTP request itself only starts once the returned stream is
  enumerated; it runs in a linked task that forwards the raw response data to
  the enumerating process.

  Returns an enumerable of non-empty strings, one per decoded response event.
  The stream ends when the request finishes or when no data has arrived for
  #{@idle_timeout_ms} ms.

  Raises if the `LB_GEMINI_API_KEY` environment variable is not set or if the
  file cannot be read.
  """
  @spec chat_streaming(String.t(), Path.t()) :: Enumerable.t()
  def chat_streaming(prompt, file_path) do
    # Fail loudly on a missing key instead of sending `key=` and silently
    # producing an empty stream.
    gemini_api_key = System.fetch_env!("LB_GEMINI_API_KEY")
    base64 = file_path |> File.read!() |> Base.encode64()

    Stream.resource(
      fn -> start_request(prompt, base64, gemini_api_key) end,
      &next_texts/1,
      &stop_request/1
    )
  end

  @doc """
  Extracts the text of the first candidate from a decoded response event.

  Concatenates the `"text"` of every part of the first candidate. Returns `""`
  when the event carries no candidate or the candidate has no content parts.
  """
  @spec extract_text(map()) :: String.t()
  def extract_text(%{"candidates" => [%{"content" => %{"parts" => parts}} | _]})
      when is_list(parts) do
    Enum.map_join(parts, & &1["text"])
  end

  def extract_text(_event), do: ""

  # Starts the HTTP request in a task and returns the stream state
  # `{task, ref, buffer}`. `self()` is taken here, inside the start function,
  # so messages go to the process that actually enumerates the stream.
  defp start_request(prompt, base64, gemini_api_key) do
    caller = self()
    # Unique tag so unrelated messages in the caller's mailbox are never
    # mistaken for response data.
    ref = make_ref()

    task =
      Task.async(fn ->
        Req.post!(
          "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro-latest:streamGenerateContent?key=#{gemini_api_key}&alt=sse",
          receive_timeout: :infinity,
          headers: [
            {"content-type", "application/json"}
          ],
          json: %{
            contents: [
              %{
                role: "user",
                parts: [
                  %{text: prompt},
                  %{inlineData: %{mimeType: "audio/wav", data: base64}}
                ]
              }
            ]
          },
          # Forward raw data; line reassembly happens on the consumer side
          # where the partial-line buffer lives.
          into: fn {:data, data}, acc ->
            send(caller, {ref, {:data, data}})
            {:cont, acc}
          end
        )

        send(caller, {ref, :done})
      end)

    {task, ref, ""}
  end

  # Produces the next batch of text fragments for `Stream.resource/3`.
  defp next_texts({_task, _ref, :done} = state), do: {:halt, state}

  defp next_texts({task, ref, buffer} = state) do
    receive do
      {^ref, {:data, data}} ->
        # A network chunk may end in the middle of a line, so only complete
        # lines are decoded and the unterminated tail is kept for later.
        {lines, rest} = split_complete_lines(buffer <> data)
        {decode_lines(lines), {task, ref, rest}}

      {^ref, :done} ->
        # Decode a final line that was not newline-terminated, then halt on
        # the next call.
        {decode_lines([buffer]), {task, ref, :done}}
    after
      @idle_timeout_ms ->
        {:halt, state}
    end
  end

  # Stops the request task and drops any of its messages still queued, so an
  # early halt neither blocks nor leaves data behind in the mailbox.
  defp stop_request({task, ref, _buffer}) do
    Task.shutdown(task, :brutal_kill)
    flush_messages(ref)
  end

  defp flush_messages(ref) do
    receive do
      {^ref, _message} -> flush_messages(ref)
    after
      0 -> :ok
    end
  end

  # Splits `text` into its newline-terminated lines and the trailing remainder.
  defp split_complete_lines(text) do
    {lines, [rest]} = text |> String.split("\n") |> Enum.split(-1)
    {lines, rest}
  end

  # Decodes every `data: {...}` line into its text, skipping empty results.
  defp decode_lines(lines) do
    lines
    |> Enum.filter(&String.starts_with?(&1, "data: {"))
    |> Enum.map(&decode_line/1)
    |> Enum.reject(&(&1 == ""))
  end

  defp decode_line("data: " <> json) do
    json |> Jason.decode!() |> extract_text()
  end
end