Skip to content

Add whisper command #814

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Aug 24, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ auto-editor
build
ffmpeg_sources
.vscode
*.bin
tests/*
!tests/*.nim
!tests/*.py
Expand Down
73 changes: 52 additions & 21 deletions ae.nimble
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,20 @@ requires "checksums"
import std/os
import std/[strutils, strformat]

var disableHevc = (if getEnv("DISABLE_HEVC").len > 0: "-d:disable_hevc" else: "")
var disableHevc = getEnv("DISABLE_HEVC").len > 0
var enableWhisper = defined(macosx)
var flags = ""

if not disableHevc:
flags &= "-d:enable_hevc "
if enableWhisper:
flags &= "-d:enable_whisper "

task test, "Test the project":
exec &"nim c {disableHevc} -r tests/rationals"
exec &"nim c {flags} -r tests/rationals"

task make, "Export the project":
exec &"nim c -d:danger {disableHevc} --out:auto-editor src/main.nim"
exec &"nim c -d:danger {flags} --out:auto-editor src/main.nim"
when defined(macosx):
exec "strip -ur auto-editor"
exec "stat -f \"%z bytes\" ./auto-editor"
Expand Down Expand Up @@ -102,6 +109,14 @@ let svtav1 = Package(
buildArguments: @["-DBUILD_APPS=OFF", "-DBUILD_DEC=OFF", "-DBUILD_ENC=ON", "-DENABLE_NASM=ON"],
ffFlag: "--enable-libsvtav1",
)
# whisper.cpp v1.7.6 release tarball, built through the cmake build system.
# The upstream test suite and the bundled server are switched off via
# `buildArguments`; `ffFlag` holds the `--enable-whisper` switch tied to this
# package. The archive file name and the unpacked directory name for this
# package are special-cased by name in `location` and `dirName`.
# NOTE(review): `sha256` is presumably checked against the downloaded
# archive -- confirm against the download code, which is not visible here.
let whisper = Package(
name: "whisper",
sourceUrl: "https://github.com/ggml-org/whisper.cpp/archive/refs/tags/v1.7.6.tar.gz",
sha256: "166140e9a6d8a36f787a2bd77f8f44dd64874f12dd8359ff7c1f4f9acb86202e",
buildSystem: "cmake",
buildArguments: @["-DWHISPER_BUILD_TESTS=OFF", "-DWHISPER_BUILD_SERVER=OFF"],
ffFlag: "--enable-whisper",
)
let x264 = Package(
name: "x264",
sourceUrl: "https://code.videolan.org/videolan/x264/-/archive/32c3b801191522961102d4bea292cdb61068d0dd/x264-32c3b801191522961102d4bea292cdb61068d0dd.tar.bz2",
Expand All @@ -124,8 +139,10 @@ let ffmpeg = Package(
var packages: seq[Package] = @[]
if not defined(macosx):
packages.add nvheaders
if enableWhisper:
packages.add whisper
packages &= [lame, opus, vpx, dav1d, svtav1, x264]
if disableHevc.len == 0:
if not disableHevc:
packages.add x265


Expand All @@ -134,6 +151,8 @@ func location(package: Package): string = # tar location
"v1.15.2.tar.gz"
elif package.name == "nv-codec-headers":
"n13.0.19.0.tar.gz"
elif package.name == "whisper":
"v1.7.6.tar.gz"
else:
package.sourceUrl.split("/")[^1]

Expand All @@ -142,6 +161,8 @@ func dirName(package: Package): string =
return "libvpx-1.15.2"
if package.name == "nv-codec-headers":
return "nv-codec-headers-n13.0.19.0"
if package.name == "whisper":
return "whisper.cpp-1.7.6"

var name = package.location
for ext in [".tar.gz", ".tar.xz", ".tar.bz2", ".orig"]:
Expand Down Expand Up @@ -192,8 +213,8 @@ proc cmakeBuild(package: Package, buildPath: string, crossWindows: bool = false)

if crossWindows:
cmakeArgs.add("-DCMAKE_SYSTEM_NAME=Windows")
cmakeArgs.add("-DCMAKE_C_COMPILER=x86_64-w64-mingw32-gcc")
cmakeArgs.add("-DCMAKE_CXX_COMPILER=x86_64-w64-mingw32-g++")
cmakeArgs.add("-DCMAKE_C_COMPILER=x86_64-w64-mingw32-gcc-posix")
cmakeArgs.add("-DCMAKE_CXX_COMPILER=x86_64-w64-mingw32-g++-posix")
cmakeArgs.add("-DCMAKE_RC_COMPILER=x86_64-w64-mingw32-windres")
cmakeArgs.add("-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER")
cmakeArgs.add("-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY")
Expand Down Expand Up @@ -248,8 +269,8 @@ proc x265Build(buildPath: string, crossWindows: bool = false) =
# Add cross-compilation flags if needed
if crossWindows:
cmakeArgs.add("-DCMAKE_SYSTEM_NAME=Windows")
cmakeArgs.add("-DCMAKE_C_COMPILER=x86_64-w64-mingw32-gcc")
cmakeArgs.add("-DCMAKE_CXX_COMPILER=x86_64-w64-mingw32-g++")
cmakeArgs.add("-DCMAKE_C_COMPILER=x86_64-w64-mingw32-gcc-posix")
cmakeArgs.add("-DCMAKE_CXX_COMPILER=x86_64-w64-mingw32-g++-posix")
cmakeArgs.add("-DCMAKE_RC_COMPILER=x86_64-w64-mingw32-windres")
cmakeArgs.add("-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER")
cmakeArgs.add("-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY")
Expand Down Expand Up @@ -280,8 +301,8 @@ proc x265Build(buildPath: string, crossWindows: bool = false) =
# Add cross-compilation flags if needed
if crossWindows:
cmakeArgs.add("-DCMAKE_SYSTEM_NAME=Windows")
cmakeArgs.add("-DCMAKE_C_COMPILER=x86_64-w64-mingw32-gcc")
cmakeArgs.add("-DCMAKE_CXX_COMPILER=x86_64-w64-mingw32-g++")
cmakeArgs.add("-DCMAKE_C_COMPILER=x86_64-w64-mingw32-gcc-posix")
cmakeArgs.add("-DCMAKE_CXX_COMPILER=x86_64-w64-mingw32-g++-posix")
cmakeArgs.add("-DCMAKE_RC_COMPILER=x86_64-w64-mingw32-windres")
cmakeArgs.add("-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER")
cmakeArgs.add("-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY")
Expand Down Expand Up @@ -310,8 +331,8 @@ proc x265Build(buildPath: string, crossWindows: bool = false) =
# Add cross-compilation flags if needed
if crossWindows:
cmakeArgs.add("-DCMAKE_SYSTEM_NAME=Windows")
cmakeArgs.add("-DCMAKE_C_COMPILER=x86_64-w64-mingw32-gcc")
cmakeArgs.add("-DCMAKE_CXX_COMPILER=x86_64-w64-mingw32-g++")
cmakeArgs.add("-DCMAKE_C_COMPILER=x86_64-w64-mingw32-gcc-posix")
cmakeArgs.add("-DCMAKE_CXX_COMPILER=x86_64-w64-mingw32-g++-posix")
cmakeArgs.add("-DCMAKE_RC_COMPILER=x86_64-w64-mingw32-windres")
cmakeArgs.add("-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER")
cmakeArgs.add("-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY")
Expand Down Expand Up @@ -377,8 +398,8 @@ proc mesonBuild(buildPath: string, crossWindows: bool = false) =
let crossFile = "build_meson/meson-cross.txt"
writeFile(crossFile, """
[binaries]
c = 'x86_64-w64-mingw32-gcc'
cpp = 'x86_64-w64-mingw32-g++'
c = 'x86_64-w64-mingw32-gcc-posix'
cpp = 'x86_64-w64-mingw32-g++-posix'
ar = 'x86_64-w64-mingw32-ar'
strip = 'x86_64-w64-mingw32-strip'
pkgconfig = 'x86_64-w64-mingw32-pkg-config'
Expand Down Expand Up @@ -419,7 +440,7 @@ proc ffmpegSetup(crossWindows: bool) =
exec &"tar {tarArgs} {package.location} && mv {package.dirName} {package.name}"
let patchFile = &"../patches/{package.name}.patch"
if fileExists(patchFile):
let cmd = &"patch -d {package.name} -i {absolutePath(patchFile)} -p1"
let cmd = &"patch -d {package.name} -i {absolutePath(patchFile)} -p1 --force"
echo "Applying patch: ", cmd
exec cmd

Expand All @@ -446,7 +467,7 @@ proc ffmpegSetup(crossWindows: bool) =
args.add("--target=x86_64-win64-gcc")
else:
args.add("--host=x86_64-w64-mingw32")
envPrefix = "CC=x86_64-w64-mingw32-gcc CXX=x86_64-w64-mingw32-g++ AR=x86_64-w64-mingw32-ar STRIP=x86_64-w64-mingw32-strip RANLIB=x86_64-w64-mingw32-ranlib "
envPrefix = "CC=x86_64-w64-mingw32-gcc-posix CXX=x86_64-w64-mingw32-g++-posix AR=x86_64-w64-mingw32-ar STRIP=x86_64-w64-mingw32-strip RANLIB=x86_64-w64-mingw32-ranlib "
let cmd = &"{envPrefix}./configure --prefix=\"{buildPath}\" --disable-shared --enable-static " & args.join(" ")
echo "RUN: ", cmd
exec cmd
Expand All @@ -464,7 +485,7 @@ var commonFlags = &"""
--disable-xlib \
--disable-bsfs \
--disable-filters \
--enable-filter=scale,pad,format,gblur,aformat,abuffer,abuffersink,aresample,atempo,anull,anullsrc,volume \
--enable-filter=whisper,scale,pad,format,gblur,aformat,abuffer,abuffersink,aresample,atempo,anull,anullsrc,volume \
--disable-encoder={encodersDisabled} \
--disable-decoder={decodersDisabled} \
--disable-demuxer={demuxersDisabled} \
Expand Down Expand Up @@ -499,10 +520,20 @@ task makeff, "Build FFmpeg from source":
when defined(linux):
pkgConfigPaths.add(buildPath / "lib/x86_64-linux-gnu/pkgconfig")
pkgConfigPaths.add(buildPath / "lib64/pkgconfig")
# Add common cmake install paths for pkg-config files
pkgConfigPaths.add(buildPath / "lib/cmake")
pkgConfigPaths.add(buildPath / "share/pkgconfig")
putEnv("PKG_CONFIG_PATH", pkgConfigPaths.join(":"))

ffmpegSetup(crossWindows=false)

# Debug: List pkg-config files to verify whisper.pc exists
when defined(linux):
echo "Checking for whisper.pc files:"
exec &"find {buildPath} -name 'whisper.pc' -type f"
echo "Current PKG_CONFIG_PATH: ", getEnv("PKG_CONFIG_PATH")
exec "pkg-config --list-all | grep whisper || echo 'whisper not found in pkg-config'"

# Configure and build FFmpeg
withDir "ffmpeg_sources/ffmpeg":
var ldflags = &"-L{buildPath}/lib"
Expand All @@ -529,7 +560,7 @@ task makeffwin, "Build FFmpeg for Windows cross-compilation":
when defined(linux):
ldflags &= &" -L{buildPath}/lib/x86_64-linux-gnu -L{buildPath}/lib64"

exec (&"""CC=x86_64-w64-mingw32-gcc CXX=x86_64-w64-mingw32-g++ AR=x86_64-w64-mingw32-ar STRIP=x86_64-w64-mingw32-strip RANLIB=x86_64-w64-mingw32-ranlib PKG_CONFIG_PATH="{buildPath}/lib/pkgconfig" ./configure --prefix="{buildPath}" \
exec (&"""CC=x86_64-w64-mingw32-gcc-posix CXX=x86_64-w64-mingw32-g++-posix AR=x86_64-w64-mingw32-ar STRIP=x86_64-w64-mingw32-strip RANLIB=x86_64-w64-mingw32-ranlib PKG_CONFIG_PATH="{buildPath}/lib/pkgconfig" ./configure --prefix="{buildPath}" \
--pkg-config-flags="--static" \
--extra-cflags="-I{buildPath}/include" \
--extra-ldflags="{ldflags}" \
Expand All @@ -545,9 +576,9 @@ task windows, "Cross-compile to Windows (requires mingw-w64)":
if not dirExists("build"):
echo "FFmpeg for Windows not found. Run 'nimble makeffwin' first."
else:
exec "nim c -d:danger " & disableHevc & " --os:windows --cpu:amd64 --cc:gcc " &
"--gcc.exe:x86_64-w64-mingw32-gcc " &
"--gcc.linkerexe:x86_64-w64-mingw32-gcc " &
exec "nim c -d:danger " & flags & " --os:windows --cpu:amd64 --cc:gcc " &
"--gcc.exe:x86_64-w64-mingw32-gcc-posix " &
"--gcc.linkerexe:x86_64-w64-mingw32-gcc-posix " &
"--passL:-lbcrypt " & # Add Windows Bcrypt library
"--passL:-lstdc++ " & # Add C++ standard library
"--passL:-static " &
Expand Down
13 changes: 13 additions & 0 deletions patches/whisper.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
--- a/cmake/whisper.pc.in
+++ b/cmake/whisper.pc.in
@@ -6,5 +6,9 @@ includedir=${prefix}/include
Name: whisper
Description: Port of OpenAI's Whisper model in C/C++
Version: @PROJECT_VERSION@
-Libs: -L${libdir} -lggml -lggml-base -lwhisper
+Libs: -L${libdir} -lwhisper -lggml -lggml-cpu -lggml-blas -lggml-metal -lggml-base
+Libs.private: -framework Accelerate -framework Metal -framework MetalKit -framework Foundation -lstdc++ -lc++
Cflags: -I${includedir}
+
+Requires:
+Conflicts:
108 changes: 108 additions & 0 deletions src/cmds/whisper.nim
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import std/strformat
import ../ffmpeg
import ../av
import ../log

proc main*(args: seq[string]) =
  ## Entry point of the `whisper` sub-command.
  ##
  ## `args[0]` is the media file to open and `args[1]` is the value passed as
  ## the `model` option of FFmpeg's `whisper` filter. The first audio stream
  ## of the input is decoded and pushed through an
  ## `abuffer -> whisper -> abuffersink` filter graph; every
  ## `lavfi.whisper.text` metadata entry found on frames leaving the graph is
  ## echoed to stdout.
  ##
  ## With no arguments a short banner is printed and the process exits with
  ## status 0. With only one argument the process exits with status 1.
  if args.len < 1:
    echo "Whisper front-end"
    quit(0)

  # Both positional arguments are mandatory. Without this guard the
  # `args[1]` access below raises an IndexDefect when only the input path
  # was supplied.
  if args.len < 2:
    echo "whisper: expected <input> <model>"
    quit(1)

  let inputPath = args[0]
  let model = args[1]

  av_log_set_level(AV_LOG_QUIET)

  let input = av.open(inputPath)
  defer: input.close()

  # Use the `whisper` filter. From the first audio stream, print out the new
  # subtitles.

  # Find the first audio stream.
  var audioStreamIndex = -1
  for i in 0..<input.streams.len:
    if input.streams[i].codecpar.codec_type == AVMEDIA_TYPE_AUDIO:
      audioStreamIndex = i
      break

  if audioStreamIndex == -1:
    echo "No audio stream found"
    quit(1)

  let filterGraph = avfilter_graph_alloc()
  defer: avfilter_graph_free(addr filterGraph)

  # Create buffer source for audio input.
  let abuffer = avfilter_get_by_name("abuffer")
  var bufferCtx: ptr AVFilterContext

  let audioStream = input.streams[audioStreamIndex]
  let sampleRate = audioStream.codecpar.sample_rate
  let channelLayout = audioStream.codecpar.ch_layout.u.mask
  let sampleFormat = cast[AVSampleFormat](audioStream.codecpar.format)

  # The source filter is configured from the stream parameters: sample rate,
  # sample format name and the channel layout mask rendered as a number.
  # NOTE(review): a stream whose layout mask is 0 would yield
  # `channel_layout=0` -- confirm that abuffer accepts that.
  let sampleFmtName = av_get_sample_fmt_name(cint(sampleFormat))
  let bufferArgs = "sample_rate=" & $sampleRate & ":sample_fmt=" & $sampleFmtName & ":channel_layout=" & $channelLayout

  if avfilter_graph_create_filter(addr bufferCtx, abuffer, "in", bufferArgs, nil, filterGraph) < 0:
    echo "Failed to create buffer source"
    quit(1)

  # Create the whisper filter. No destination option is set; the text is
  # read back from frame metadata further down.
  # NOTE(review): `model` is spliced into the filter argument string
  # unescaped -- a value containing `:` may not parse as intended.
  let whisperFilter = avfilter_get_by_name("whisper")
  var whisperCtx: ptr AVFilterContext
  let whisperArgs = "model=" & model

  # NOTE(review): `error` comes from ../log and is presumably terminating,
  # since nothing follows it in these branches -- confirm.
  if avfilter_graph_create_filter(addr whisperCtx, whisperFilter, "whisper", whisperArgs, nil, filterGraph) < 0:
    error &"Failed to create whisper filter with model: {model}"

  # Create buffer sink.
  let abuffersink = avfilter_get_by_name("abuffersink")
  var sinkCtx: ptr AVFilterContext

  if avfilter_graph_create_filter(addr sinkCtx, abuffersink, "out", nil, nil, filterGraph) < 0:
    error "Failed to create buffer sink"

  # Link filters: buffer -> whisper -> sink
  if avfilter_link(bufferCtx, 0, whisperCtx, 0) < 0:
    error "Failed to link buffer to whisper"
  if avfilter_link(whisperCtx, 0, sinkCtx, 0) < 0:
    error "Failed to link whisper to sink"

  if avfilter_graph_config(filterGraph, nil) < 0:
    error "Failed to configure filter graph"

  # Set up decoder for the audio stream.
  let decoderCtx = initDecoder(audioStream.codecpar)
  defer: avcodec_free_context(addr decoderCtx)

  let frame = av_frame_alloc()
  defer: av_frame_free(addr frame)

  let outputFrame = av_frame_alloc()
  defer: av_frame_free(addr outputFrame)

  # Pulls every frame currently available from the sink, echoes the
  # `lavfi.whisper.text` metadata value when one is attached, and releases
  # the frame. Shared by the per-frame path and the final flush.
  template printPendingText() =
    while av_buffersink_get_frame_flags(sinkCtx, outputFrame, 0) >= 0:
      if outputFrame.metadata != nil:
        let whisperTextEntry = av_dict_get(outputFrame.metadata, "lavfi.whisper.text", nil, 0)
        if whisperTextEntry != nil and whisperTextEntry.value != nil:
          echo $whisperTextEntry.value
      av_frame_unref(outputFrame)

  for decodedFrame in input.decode(cint(audioStreamIndex), decoderCtx, frame):
    # A frame the graph refuses is reported and skipped; decoding goes on.
    if av_buffersrc_write_frame(bufferCtx, decodedFrame) < 0:
      echo "Error feeding frame to filter"
      continue

    printPendingText()

  # Flush the filter: a nil frame is written to the source, then whatever
  # the graph still holds is drained.
  if av_buffersrc_write_frame(bufferCtx, nil) >= 0:
    printPendingText()
14 changes: 12 additions & 2 deletions src/ffmpeg.nim
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,20 @@ when defined(linux):
{.passL: "-L./build/lib -lavfilter -lavformat -lavcodec -lswresample -lswscale -lavutil".}
{.passL: "-lmp3lame -lopus -lvpx -lx264 -ldav1d -lSvtAv1Enc".}

when not defined(disable_hevc):
{.passL: "-lx265".}
when defined(enable_whisper):
{.passL: "-lwhisper -lggml -lggml-cpu -lggml-blas -lggml-metal -lggml-base".}
when defined(macosx):
{.passL: "-framework Accelerate -framework Metal -framework MetalKit -framework Foundation".}

when defined(enable_hevc):
{.passL: "-lx265".}

when defined(enable_hevc) or defined(enable_whisper):
when defined(macosx): # C++ linkers
{.passL: "-lc++"}
else:
{.passL: "-lstdc++"}

{.passL: "-lm".}

import std/posix
Expand Down Expand Up @@ -728,6 +736,8 @@ proc avfilter_link*(src: ptr AVFilterContext, srcpad: cuint,
# Filter lookup
# Both procs are direct bindings (`importc`) to functions declared in
# <libavfilter/avfilter.h>.
# `avfilter_get_by_name` takes a filter name and yields a `ptr AVFilter`.
# NOTE(review): presumably nil when no filter of that name is compiled in --
# confirm against the libavfilter documentation.
proc avfilter_get_by_name*(name: cstring): ptr AVFilter {.importc,
header: "<libavfilter/avfilter.h>".}
# `av_filter_iterate` takes a pointer to an opaque pointer and yields a
# `ptr AVFilter`.
# NOTE(review): per the libavfilter docs the opaque pointer is expected to
# start out as nil and carries the iteration state between calls, with nil
# returned once the list is exhausted -- confirm before relying on it.
proc av_filter_iterate*(opaque: ptr pointer): ptr AVFilter {.importc,
header: "<libavfilter/avfilter.h>".}

# Filter input/output management
proc avfilter_inout_alloc*(): ptr AVFilterInOut {.importc,
Expand Down
Loading
Loading