4
4
import re
5
5
from dataclasses import dataclass
6
6
from fractions import Fraction
7
+ from math import ceil
7
8
from typing import TYPE_CHECKING
8
9
9
10
import av
10
11
import numpy as np
12
+ from av .audio .fifo import AudioFifo
11
13
from av .subtitles .subtitle import AssSubtitle
12
14
13
15
from auto_editor import version
14
16
from auto_editor .utils .subtitle_tools import convert_ass_to_text
15
- from auto_editor .wavfile import read
16
17
17
18
if TYPE_CHECKING :
18
19
from collections .abc import Iterator
22
23
from numpy .typing import NDArray
23
24
24
25
from auto_editor .ffwrapper import FileInfo
25
- from auto_editor .output import Ensure
26
26
from auto_editor .utils .bar import Bar
27
27
from auto_editor .utils .log import Log
28
28
29
29
30
30
@dataclass (slots = True )
31
31
class FileSetup :
32
32
src : FileInfo
33
- ensure : Ensure
34
33
strict : bool
35
34
tb : Fraction
36
35
bar : Bar
@@ -89,6 +88,41 @@ def obj_tag(tag: str, tb: Fraction, obj: dict[str, Any]) -> str:
89
88
return key
90
89
91
90
91
def iter_audio(src, tb: Fraction, stream: int = 0) -> Iterator[float]:
    """Yield the peak absolute sample value (0.0-1.0) for each timebase tick.

    Decodes the given audio stream, resamples to float so samples lie in
    [-1, 1], and slices the sample flow into ticks of (1 / tb) seconds.
    Because samples-per-tick is usually fractional, the remainder is carried
    in an error accumulator so tick boundaries stay aligned over long files.
    """
    fifo = AudioFifo()
    # Use a with-block instead of try/finally: the original bound `container`
    # inside the `try`, so a failing av.open() made the `finally` close an
    # unbound name (UnboundLocalError masking the real error).
    with av.open(src.path, "r") as container:
        audio_stream = container.streams.audio[stream]
        sample_rate = audio_stream.rate

        # Exact (fractional) number of samples per tick.
        exact_size = (1 / tb) * sample_rate
        accumulated_error = Fraction(0)

        # Resample so that audio data is between [-1, 1]
        resampler = av.AudioResampler(
            av.AudioFormat("flt"), audio_stream.layout, sample_rate
        )

        for frame in container.decode(audio=stream):
            frame.pts = None  # Skip time checks

            for reframe in resampler.resample(frame):
                fifo.write(reframe)

            # Drain whole ticks; `ceil` guarantees a full tick is buffered
            # even when the rounded read comes out one sample larger.
            while fifo.samples >= ceil(exact_size):
                size_with_error = exact_size + accumulated_error
                current_size = round(size_with_error)
                accumulated_error = size_with_error - current_size

                audio_chunk = fifo.read(current_size)
                assert audio_chunk is not None
                arr = audio_chunk.to_ndarray().flatten()
                yield float(np.max(np.abs(arr)))

        # NOTE(review): any trailing samples shorter than one tick are still
        # discarded here — confirm whether the final partial chunk should be
        # yielded as well.
124
+
125
+
92
126
def iter_motion (src , tb , stream : int , blur : int , width : int ) -> Iterator [float ]:
93
127
container = av .open (src .path , "r" )
94
128
@@ -138,7 +172,6 @@ def iter_motion(src, tb, stream: int, blur: int, width: int) -> Iterator[float]:
138
172
139
173
@dataclass (slots = True )
140
174
class Levels :
141
- ensure : Ensure
142
175
src : FileInfo
143
176
tb : Fraction
144
177
bar : Bar
@@ -151,24 +184,16 @@ def media_length(self) -> int:
151
184
if (arr := self .read_cache ("audio" , {"stream" : 0 })) is not None :
152
185
return len (arr )
153
186
154
- sr , samples = read (self .ensure .audio (self .src , 0 ))
155
- samp_count = len (samples )
156
- del samples
157
-
158
- samp_per_ticks = sr / self .tb
159
- ticks = int (samp_count / samp_per_ticks )
160
- self .log .debug (f"Audio Length: { ticks } " )
161
- self .log .debug (
162
- f"... without rounding: { float (samp_count / samp_per_ticks )} "
163
- )
164
- return ticks
187
+ result = sum (1 for _ in iter_audio (self .src , self .tb , 0 ))
188
+ self .log .debug (f"Audio Length: { result } " )
189
+ return result
165
190
166
191
# If there's no audio, get length in video metadata.
167
- with av .open (f" { self .src .path } " ) as cn :
168
- if len (cn .streams .video ) < 1 :
192
+ with av .open (self .src .path ) as container :
193
+ if len (container .streams .video ) == 0 :
169
194
self .log .error ("Could not get media duration" )
170
195
171
- video = cn .streams .video [0 ]
196
+ video = container .streams .video [0 ]
172
197
173
198
if video .duration is None or video .time_base is None :
174
199
dur = 0
@@ -213,56 +238,70 @@ def cache(self, tag: str, obj: dict[str, Any], arr: np.ndarray) -> np.ndarray:
213
238
return arr
214
239
215
240
def audio(self, stream: int) -> NDArray[np.float64]:
    """Return per-tick peak volume (0.0-1.0) for an audio stream.

    Results are memoized via the cache helpers.

    Raises:
        LevelError: if `stream` is not a valid audio-stream index.
    """
    if stream >= len(self.src.audios):
        raise LevelError(f"audio: audio stream '{stream}' does not exist.")

    if (arr := self.read_cache("audio", {"stream": stream})) is not None:
        return arr

    # Estimate the tick count from container metadata so the result buffer
    # rarely needs to grow; the estimate may be wrong, hence the resize below.
    with av.open(self.src.path, "r") as container:
        audio = container.streams.audio[stream]
        if audio.duration is not None and audio.time_base is not None:
            inaccurate_dur = int(audio.duration * audio.time_base * self.tb)
        elif container.duration is not None:
            inaccurate_dur = int(container.duration / av.time_base * self.tb)
        else:
            inaccurate_dur = 1024

    # Guard: a zero (or negative) estimate would make the doubling below a
    # no-op (concatenating an empty array) and `result[index] = value` would
    # raise IndexError on the first tick.
    if inaccurate_dur < 1:
        inaccurate_dur = 1024

    bar = self.bar
    bar.start(inaccurate_dur, "Analyzing audio volume")

    result = np.zeros((inaccurate_dur), dtype=np.float64)
    index = 0
    for value in iter_audio(self.src, self.tb, stream):
        if index > len(result) - 1:
            # Metadata estimate was too small: double the buffer.
            result = np.concatenate(
                (result, np.zeros((len(result)), dtype=np.float64))
            )
        result[index] = value
        bar.tick(index)
        index += 1

    bar.end()
    # Trim to the ticks actually produced before caching.
    return self.cache("audio", {"stream": stream}, result[:index])
248
272
249
- threshold_list = np .zeros ((audio_ticks ), dtype = np .float64 )
273
def motion(self, stream: int, blur: int, width: int) -> NDArray[np.float64]:
    """Return per-tick motion levels (0.0-1.0) for a video stream.

    Results are memoized via the cache helpers, keyed on stream/width/blur.

    Raises:
        LevelError: if `stream` is not a valid video-stream index.
    """
    if stream >= len(self.src.videos):
        raise LevelError(f"motion: video stream '{stream}' does not exist.")

    mobj = {"stream": stream, "width": width, "blur": blur}
    if (arr := self.read_cache("motion", mobj)) is not None:
        return arr

    # Estimate the tick count from stream metadata; may be wrong, so the
    # result buffer below can still grow.
    with av.open(self.src.path, "r") as container:
        video = container.streams.video[stream]
        inaccurate_dur = (
            1024
            if video.duration is None or video.time_base is None
            else int(video.duration * video.time_base * self.tb)
        )

    # Guard: a zero (or negative) estimate would make the doubling below a
    # no-op (concatenating an empty array) and `result[index] = value` would
    # raise IndexError on the first tick.
    if inaccurate_dur < 1:
        inaccurate_dur = 1024

    bar = self.bar
    bar.start(inaccurate_dur, "Analyzing motion")

    result = np.zeros((inaccurate_dur), dtype=np.float64)
    index = 0
    for value in iter_motion(self.src, self.tb, stream, blur, width):
        if index > len(result) - 1:
            # Metadata estimate was too small: double the buffer.
            result = np.concatenate(
                (result, np.zeros((len(result)), dtype=np.float64))
            )
        result[index] = value
        bar.tick(index)
        index += 1

    bar.end()
    # Trim to the ticks actually produced before caching.
    return self.cache("motion", mobj, result[:index])
266
305
267
306
def subtitle (
268
307
self ,
@@ -336,37 +375,3 @@ def subtitle(
336
375
container .close ()
337
376
338
377
return result
339
-
340
def motion(self, stream: int, blur: int, width: int) -> NDArray[np.float64]:
    """Return per-tick motion levels for a video stream, with caching.

    Raises LevelError when the requested video stream index is out of range.
    """
    if stream >= len(self.src.videos):
        raise LevelError(f"motion: video stream '{stream}' does not exist.")

    mobj = {"stream": stream, "width": width, "blur": blur}
    if (cached := self.read_cache("motion", mobj)) is not None:
        return cached

    # Rough tick-count estimate from stream metadata (fallback: 1024).
    with av.open(self.src.path, "r") as container:
        vstream = container.streams.video[stream]
        if vstream.duration is None or vstream.time_base is None:
            estimate = 1024
        else:
            estimate = int(vstream.duration * vstream.time_base * self.tb)

    bar = self.bar
    bar.start(estimate, "Analyzing motion")

    levels = np.zeros((estimate), dtype=np.float64)
    count = 0
    for level in iter_motion(self.src, self.tb, stream, blur, width):
        # Grow by doubling whenever the estimate proves too small.
        if count >= len(levels):
            levels = np.concatenate(
                (levels, np.zeros((len(levels)), dtype=np.float64))
            )
        levels[count] = level
        bar.tick(count)
        count += 1

    bar.end()
    return self.cache("motion", mobj, levels[:count])
0 commit comments