智能添加字幕-autosub
智能添加字幕-autosub
ytkz主方法:generate_subtitles
提取音频文件
generate_subtitles调用extract_audio,提取音频
audio_filename, audio_rate = extract_audio(source_path)
extract_audio函数如下
def extract_audio(filename, channels=1, rate=16000):
    """
    Extract audio from an input file to a temporary WAV file.

    Runs ``ffmpeg`` to convert *filename* into a WAV file with the requested
    channel count and sample rate.

    Args:
        filename: Path of the input audio/video file.
        channels: Channel count passed to ffmpeg's ``-ac`` option.
        rate: Sample rate passed to ffmpeg's ``-ar`` option.

    Returns:
        A ``(wav_path, rate)`` tuple. The WAV file is created with
        ``delete=False``, so the caller is responsible for removing it.

    Raises:
        Exception: If *filename* is not an existing file, or if no ffmpeg
            executable is found.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    # Validate everything BEFORE creating the temp file, so that a bad path
    # or a missing ffmpeg does not leave an orphaned file behind.
    if not os.path.isfile(filename):
        print("The given file does not exist: {}".format(filename))
        raise Exception("Invalid filepath: {}".format(filename))
    # NOTE(review): `which` is defined elsewhere in this file; presumably it
    # returns a falsy value when the executable is not on PATH.
    if not which("ffmpeg") and not which("ffmpeg.exe"):
        print("ffmpeg: Executable not found on machine.")
        raise Exception("Dependency not found: ffmpeg")

    temp = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
    # Only the reserved path is needed; ffmpeg writes the file itself.
    temp.close()

    command = ["ffmpeg", "-y", "-i", filename,
               "-ac", str(channels), "-ar", str(rate),
               "-loglevel", "error", temp.name]
    use_shell = os.name == "nt"
    try:
        # The context manager closes the devnull handle the original leaked.
        with open(os.devnull) as devnull:
            subprocess.check_output(command, stdin=devnull, shell=use_shell)
    except BaseException:
        # Do not leave a partial/empty WAV behind when the conversion fails.
        if os.path.exists(temp.name):
            os.unlink(temp.name)
        raise
    return temp.name, rate
调用ffmpeg,生成临时temp音频文件
提取语音区域
从临时temp音频文件中分析说话的区域
regions = find_speech_regions(audio_filename)
def find_speech_regions(filename, frame_width=4096, min_region_size=0.5, max_region_size=6): # pylint: disable=too-many-locals
    """
    Perform voice activity detection on a given audio file.

    The file is read in chunks of *frame_width* frames and the RMS energy of
    each chunk is computed. Chunks whose energy is at or below the threshold
    returned by ``percentile(energies, 0.2)`` are treated as silence; runs of
    non-silent chunks become regions.

    Args:
        filename: Path of a WAV file readable by ``wave.open``.
        frame_width: Number of frames per analysed chunk.
        min_region_size: Regions shorter than this many seconds are dropped.
        max_region_size: A region is closed once it reaches this many seconds.

    Returns:
        A list of ``(start, end)`` tuples in seconds. Empty when the file
        contains no frames.
    """
    reader = wave.open(filename)
    try:
        sample_width = reader.getsampwidth()
        rate = reader.getframerate()
        n_channels = reader.getnchannels()
        n_chunks = int(math.ceil(reader.getnframes() * 1.0 / frame_width))
        energies = []
        for _ in range(n_chunks):
            chunk = reader.readframes(frame_width)
            energies.append(audioop.rms(chunk, sample_width * n_channels))
    finally:
        # The original never closed the reader, leaking the file handle.
        reader.close()

    # No frames at all: nothing to analyse, and no threshold can be computed.
    if not energies:
        return []

    chunk_duration = float(frame_width) / rate
    threshold = percentile(energies, 0.2)

    elapsed_time = 0
    regions = []
    region_start = None

    for energy in energies:
        is_silence = energy <= threshold
        # Compare against None explicitly: a region that starts at time 0 is
        # a valid open region, but 0 is falsy and the original therefore
        # kept restarting it on every following chunk.
        in_region = region_start is not None
        max_exceeded = in_region and elapsed_time - region_start >= max_region_size

        if in_region and (max_exceeded or is_silence):
            if elapsed_time - region_start >= min_region_size:
                regions.append((region_start, elapsed_time))
            region_start = None
        elif not in_region and not is_silence:
            region_start = elapsed_time
        elapsed_time += chunk_duration
    # NOTE(review): a region still open when the audio ends is not emitted;
    # this matches the original behaviour - confirm whether it is intended.
    return regions
多线程执行语音转文字(speech-to-text)
用于将输入音频或视频文件的区域转换为FLAC音频文件
class FLACConverter(object): # pylint: disable=too-few-public-methods
    """
    Class for converting a region of an input audio or video file into a FLAC audio file.

    Instances are callable: calling one with a ``(start, end)`` region runs
    ffmpeg on ``source_path`` and returns the data of the resulting FLAC file.
    """
    def __init__(self, source_path, include_before=0.25, include_after=0.25):
        """
        Args:
            source_path: Path of the input file handed to ffmpeg's ``-i``.
            include_before: Amount subtracted from each region start
                (clamped at 0) before extraction.
            include_after: Amount added to each region end before extraction.
        """
        self.source_path = source_path
        self.include_before = include_before
        self.include_after = include_after

    def __call__(self, region):
        """
        Extract *region* from the source file and return it as FLAC data.

        Args:
            region: A ``(start, end)`` pair.

        Returns:
            The contents of the temporary FLAC file, or ``None`` if a
            ``KeyboardInterrupt`` occurred during the conversion.

        Raises:
            subprocess.CalledProcessError: If ffmpeg exits with a non-zero
                status.
        """
        try:
            start, end = region
            start = max(0, start - self.include_before)
            end += self.include_after
            temp = tempfile.NamedTemporaryFile(suffix='.flac', delete=False)
            try:
                command = ["ffmpeg", "-ss", str(start), "-t", str(end - start),
                           "-y", "-i", self.source_path,
                           "-loglevel", "error", temp.name]
                use_shell = os.name == "nt"
                # Context manager closes the devnull handle the original leaked.
                with open(os.devnull) as devnull:
                    subprocess.check_output(command, stdin=devnull, shell=use_shell)
                return temp.read()
            finally:
                # Always remove the temp file - also when ffmpeg fails or the
                # conversion is interrupted (the original leaked it then).
                temp.close()
                os.unlink(temp.name)
        except KeyboardInterrupt:
            return None
class SpeechRecognizer(object): # pylint: disable=too-few-public-methods
    """
    Class for performing speech-to-text for an input FLAC file.

    Instances are callable: calling one with FLAC data posts it to the URL
    built from ``GOOGLE_SPEECH_API_URL`` and returns the first transcript
    found in the response.
    """
    def __init__(self, language="en", rate=44100, retries=3, api_key=GOOGLE_SPEECH_API_KEY):
        """
        Args:
            language: Value substituted for ``lang`` in the API URL.
            rate: Sample rate advertised in the ``Content-Type`` header.
            retries: Maximum number of request attempts per call.
            api_key: Value substituted for ``key`` in the API URL.
        """
        self.language = language
        self.rate = rate
        self.api_key = api_key
        self.retries = retries

    def __call__(self, data):
        """
        Send *data* to the speech API and return the recognised text.

        Args:
            data: Request body to post (the FLAC audio).

        Returns:
            The first transcript found, with its first character upper-cased,
            or ``None`` when no attempt yields a transcript or a
            ``KeyboardInterrupt`` occurs.
        """
        try:
            # Request parameters do not change between attempts; build once.
            url = GOOGLE_SPEECH_API_URL.format(lang=self.language, key=self.api_key)
            headers = {"Content-Type": "audio/x-flac; rate=%d" % self.rate}

            for _ in range(self.retries):
                try:
                    resp = requests.post(url, data=data, headers=headers)
                except requests.exceptions.ConnectionError:
                    continue

                # The body is parsed line by line, one JSON document per line.
                for line in resp.content.decode('utf-8').split("\n"):
                    try:
                        parsed = json.loads(line)
                        transcript = parsed['result'][0]['alternative'][0]['transcript']
                    except (IndexError, KeyError, TypeError, ValueError):
                        # IndexError: empty result list (no result).
                        # KeyError/TypeError: line has an unexpected structure;
                        # skip it instead of crashing the caller.
                        # ValueError: not valid JSON (JSON decode errors are
                        # ValueError subclasses), e.g. a blank line.
                        continue
                    return transcript[:1].upper() + transcript[1:]
        except KeyboardInterrupt:
            return None
        # Every attempt failed or produced no transcript.
        return None