mirror of
https://github.com/lfnovo/open-notebook.git
synced 2026-04-29 20:10:07 +00:00
add async content processing
This commit is contained in:
parent
ac2ea9e554
commit
00f070a644
10 changed files with 541 additions and 395 deletions
|
|
@ -1,114 +1,141 @@
|
|||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
from functools import partial
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from open_notebook.graphs.content_processing.state import ContentState
|
||||
|
||||
|
||||
async def extract_audio_from_video(input_file, output_file, stream_index):
    """
    Extract the specified audio stream to MP3 format asynchronously.

    The blocking ``ffmpeg`` invocation runs in the event loop's default
    executor so the loop is not blocked while the file is transcoded.

    Args:
        input_file: Path of the media file to read.
        output_file: Path of the MP3 file to write (overwritten if present).
        stream_index: Zero-based position among the file's audio streams;
            passed to ffmpeg as ``-map 0:a:<stream_index>``.

    Returns:
        True when ffmpeg exits with status 0, False on any failure.
        Failures are logged and never raised to the caller.
    """

    def _extract():
        cmd = [
            "ffmpeg",
            "-i",
            input_file,
            "-map",
            f"0:a:{stream_index}",  # Select specific audio stream
            "-codec:a",
            "libmp3lame",  # Use MP3 codec
            "-q:a",
            "2",  # High quality setting
            "-y",  # Overwrite output file if exists
            output_file,
        ]

        try:
            result = subprocess.run(
                cmd,
                # ffmpeg reads interactive commands from stdin; detach it so
                # it cannot block on, or swallow, the parent's input.
                stdin=subprocess.DEVNULL,
                capture_output=True,
                text=True,
                # ffmpeg echoes file metadata on stderr, which is not
                # guaranteed to be valid UTF-8; a decode error must not turn
                # a successful extraction into a reported failure.
                errors="replace",
            )
        except Exception as e:
            # Broad on purpose: the contract is a boolean result, so a
            # missing binary or bad argument is reported, not raised.
            logger.error(f"Error extracting audio: {str(e)}")
            return False

        if result.returncode != 0:
            logger.error(f"Error extracting audio: FFmpeg failed: {result.stderr}")
            return False

        return True

    # get_running_loop() is the documented way to reach the loop from inside
    # a coroutine; get_event_loop() is deprecated for this use.
    return await asyncio.get_running_loop().run_in_executor(None, _extract)
|
||||
|
||||
|
||||
async def get_audio_streams(input_file):
    """
    Analyze video file and return information about all audio streams
    asynchronously.

    Runs ``ffprobe`` in the event loop's default executor and parses its
    JSON report.

    Args:
        input_file: Path of the media file to inspect.

    Returns:
        The list found under the ``"streams"`` key of ffprobe's JSON output
        (one entry per audio stream), or an empty list when ffprobe fails,
        its output cannot be parsed, or the key is absent. Failures are
        logged and never raised to the caller.
    """

    def _analyze():
        logger.debug(f"Analyzing video file {input_file} for audio streams")

        # Get stream information in JSON format
        cmd = [
            "ffprobe",
            "-v",
            # "error" rather than "quiet": quiet silences stderr entirely,
            # which left the failure message below without any detail.
            "error",
            "-print_format",
            "json",
            "-show_streams",
            "-select_streams",
            "a",
            input_file,
        ]

        try:
            result = subprocess.run(
                cmd,
                stdin=subprocess.DEVNULL,
                capture_output=True,
                text=True,
                # Stream tags may contain bytes that are not valid UTF-8;
                # do not let decoding abort an otherwise usable report.
                errors="replace",
            )
            if result.returncode != 0:
                logger.error(
                    f"Error analyzing file: FFprobe failed: {result.stderr}"
                )
                return []

            data = json.loads(result.stdout)
            return data.get("streams", [])

        except Exception as e:
            # Broad on purpose: callers treat "no streams" and "probe failed"
            # alike, so every failure is logged and mapped to [].
            logger.error(f"Error analyzing file: {str(e)}")
            return []

    # get_running_loop() is the documented way to reach the loop from inside
    # a coroutine; get_event_loop() is deprecated for this use.
    return await asyncio.get_running_loop().run_in_executor(None, _analyze)
|
||||
|
||||
|
||||
async def select_best_audio_stream(streams):
    """
    Select the best audio stream based on various quality metrics.

    Each stream is scored as ``channels * 10 + bit_rate / 1e6 +
    sample_rate / 48000`` and the highest-scoring stream is returned; on a
    tie the earliest stream in ``streams`` wins.

    Args:
        streams: List of stream dictionaries. The ``"channels"``,
            ``"bit_rate"`` and ``"sample_rate"`` keys are read when present;
            values may be numbers or numeric strings.

    Returns:
        The selected stream dictionary (the same object that appears in
        ``streams``), or None when ``streams`` is empty.
    """
    if not streams:
        logger.debug("No audio streams found")
        return None

    logger.debug(f"Found {len(streams)} audio streams")

    def _as_number(value):
        # Missing or non-numeric fields count as 0 instead of raising.
        try:
            return float(value)
        except (TypeError, ValueError):
            return 0.0

    def _score(stream):
        # Prefer more channels (stereo over mono); this term dominates.
        channels = _as_number(stream.get("channels"))
        # Prefer higher bit rates. Kept fractional: truncating Mbps to an
        # int yields 0 for any stream below 1 Mbps, so the bit rate would
        # never influence the choice for ordinary audio.
        bit_rate = _as_number(stream.get("bit_rate"))
        # Prefer higher sample rates (same reasoning: no truncation).
        sample_rate = _as_number(stream.get("sample_rate"))
        return channels * 10 + bit_rate / 1000000 + sample_rate / 48000

    # The scoring is a handful of arithmetic operations on in-memory dicts,
    # so it runs inline; a thread-pool hop would only add latency.
    return max(streams, key=_score)
|
||||
|
||||
|
||||
def extract_best_audio_from_video(data: ContentState):
|
||||
async def extract_best_audio_from_video(data: ContentState):
|
||||
"""
|
||||
Main function to extract the best audio stream from a video file
|
||||
Main function to extract the best audio stream from a video file asynchronously
|
||||
"""
|
||||
input_file = data.get("file_path")
|
||||
assert input_file is not None, "Input file path must be provided"
|
||||
if not os.path.exists(input_file):
|
||||
|
||||
def _check_file(path):
|
||||
return os.path.exists(path)
|
||||
|
||||
file_exists = await asyncio.get_event_loop().run_in_executor(
|
||||
None, partial(_check_file, input_file)
|
||||
)
|
||||
|
||||
if not file_exists:
|
||||
logger.critical(f"Input file not found: {input_file}")
|
||||
return False
|
||||
|
||||
|
|
@ -116,20 +143,20 @@ def extract_best_audio_from_video(data: ContentState):
|
|||
output_file = f"{base_name}_audio.mp3"
|
||||
|
||||
# Get all audio streams
|
||||
streams = get_audio_streams(input_file)
|
||||
streams = await get_audio_streams(input_file)
|
||||
if not streams:
|
||||
logger.debug("No audio streams found in the file")
|
||||
return False
|
||||
|
||||
# Select best stream
|
||||
best_stream = select_best_audio_stream(streams)
|
||||
best_stream = await select_best_audio_stream(streams)
|
||||
if not best_stream:
|
||||
logger.error("Could not determine best audio stream")
|
||||
return False
|
||||
|
||||
# Extract the selected stream
|
||||
stream_index = streams.index(best_stream)
|
||||
success = extract_audio_from_video(input_file, output_file, stream_index)
|
||||
success = await extract_audio_from_video(input_file, output_file, stream_index)
|
||||
|
||||
if success:
|
||||
logger.debug(f"Successfully extracted audio to: {output_file}")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue