fix(web-test): fix audio-video sync drift in narration

Screencast frame duplication (Math.round) makes the encoded video ~5%
longer than the wall-clock recording time. Caption timestamps are
wall-clock based, so the narration audio track drifted ahead of the video
by ~8s at the midpoint of a 5-minute video.

Fix:
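- stopRecording() now writes { recordingDuration, captions } to the
  .captions.json file (previously a bare captions array; addNarration()
  still accepts the old array format)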
- addNarration() reads actual video duration via ffprobe and, when
  recordingDuration is known and the two differ by more than 0.5%, scales
  caption timestamps by the videoDuration/recordingDuration ratio
- Phase 2 timeline now tracks actual cumulative position instead of
  computing gaps from previous caption data (prevents MP3 frame
  quantization drift)
- Also fixed findFfmpeg() → resolveFfmpeg() call in addNarration

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Nick Shirokov
2026-03-03 00:45:30 +03:00
parent 7f6ea32533
commit 5acb586bbf
+33 -19
View File
@@ -2552,7 +2552,8 @@ export async function stopRecording() {
lastCaptions = recorder.captions || [];
if (lastCaptions.length) {
const captionsPath = outputPath.replace(/\.[^.]+$/, '.captions.json');
writeFileSync(captionsPath, JSON.stringify(lastCaptions, null, 2), 'utf-8');
const captionsData = { recordingDuration: duration, captions: lastCaptions };
writeFileSync(captionsPath, JSON.stringify(captionsData, null, 2), 'utf-8');
}
recorder = null;
@@ -2645,25 +2646,43 @@ export function getCaptions() {
* @returns {{ file: string, duration: number, size: number, captions: number, warnings?: string[] }}
*/
export async function addNarration(videoPath, opts = {}) {
const ffmpegPath = opts.ffmpegPath || findFfmpeg();
const ffmpegPath = resolveFfmpeg(opts.ffmpegPath);
const ttsProvider = getTtsProvider(opts.provider || 'edge');
const ttsOpts = { voice: opts.voice, apiKey: opts.apiKey, apiUrl: opts.apiUrl, model: opts.model };
// Resolve captions: explicit > lastCaptions > .captions.json
let captions = opts.captions;
let recordingDuration = null; // wall-clock duration of the recording (seconds)
if (!captions || !captions.length) {
captions = lastCaptions.length ? [...lastCaptions] : null;
}
if (!captions || !captions.length) {
const captionsJsonPath = videoPath.replace(/\.[^.]+$/, '.captions.json');
if (fsExistsSync(captionsJsonPath)) {
captions = JSON.parse(readFileSync(captionsJsonPath, 'utf-8'));
const raw = JSON.parse(readFileSync(captionsJsonPath, 'utf-8'));
// Support both formats: array (old) and { recordingDuration, captions } (new)
if (Array.isArray(raw)) {
captions = raw;
} else {
captions = raw.captions;
recordingDuration = raw.recordingDuration || null;
}
}
}
if (!captions || !captions.length) {
throw new Error('No captions available. Record with showCaption() first, or pass opts.captions.');
}
// Scale caption timestamps to match actual video duration
// (screencast frame duplication can cause video to be longer than wall-clock time)
const videoDuration = getAudioDuration(videoPath, ffmpegPath);
if (recordingDuration && recordingDuration > 0) {
const timeScale = videoDuration / recordingDuration;
if (Math.abs(timeScale - 1) > 0.005) { // only scale if >0.5% difference
captions = captions.map(c => ({ ...c, time: Math.round(c.time * timeScale) }));
}
}
// Output path
const ext = extname(videoPath);
const base = videoPath.slice(0, -ext.length);
@@ -2703,34 +2722,27 @@ export async function addNarration(videoPath, opts = {}) {
}
// Phase 2: Build timeline — interleave silence gaps and TTS segments
// Track actual accumulated position to prevent drift from MP3 frame quantization
const segments = []; // { file, type: 'silence'|'tts' }
let currentPosition = 0; // actual accumulated duration in seconds
for (let i = 0; i < captions.length; i++) {
const captionTimeMs = captions[i].time;
const captionTimeSec = captions[i].time / 1000;
const ttsFile = ttsFiles[i];
const ttsDuration = getAudioDuration(ttsFile, ffmpegPath);
// Calculate gap before this caption
let gapStart;
if (i === 0) {
gapStart = 0;
} else {
const prevCaptionTimeMs = captions[i - 1].time;
const prevTtsDuration = getAudioDuration(ttsFiles[i - 1], ffmpegPath);
gapStart = prevCaptionTimeMs / 1000 + prevTtsDuration;
}
const gapDuration = captionTimeMs / 1000 - gapStart;
if (gapDuration > 0.05) {
// Add silence to reach this caption's target timestamp
const silenceDuration = captionTimeSec - currentPosition;
if (silenceDuration > 0.05) {
const silenceFile = pathJoin(tempDir, `silence_${i}.mp3`);
generateSilence(silenceFile, gapDuration, ffmpegPath);
generateSilence(silenceFile, silenceDuration, ffmpegPath);
segments.push({ file: silenceFile, type: 'silence' });
currentPosition += getAudioDuration(silenceFile, ffmpegPath);
}
// Speed up TTS if it's longer than gap to next caption (instead of trimming)
if (i < captions.length - 1) {
const nextTimeMs = captions[i + 1].time;
const maxDuration = (nextTimeMs - captionTimeMs) / 1000;
const maxDuration = (captions[i + 1].time - captions[i].time) / 1000;
if (ttsDuration > maxDuration && maxDuration > 0.1) {
const tempo = ttsDuration / maxDuration;
const spedFile = pathJoin(tempDir, `tts_${i}_sped.mp3`);
@@ -2739,11 +2751,13 @@ export async function addNarration(videoPath, opts = {}) {
'-c:a', 'libmp3lame', '-b:a', '128k', spedFile,
], { stdio: 'pipe', timeout: 10000 });
segments.push({ file: spedFile, type: 'tts' });
currentPosition += getAudioDuration(spedFile, ffmpegPath);
continue;
}
}
segments.push({ file: ttsFile, type: 'tts' });
currentPosition += ttsDuration;
}
// Phase 3: Concat all segments into a single narration track