src/remux/mp4-remuxer.ts
import AAC from './aac-helper';
import MP4 from './mp4-generator';
import type { HlsEventEmitter } from '../events';
import { Events } from '../events';
import { ErrorTypes, ErrorDetails } from '../errors';
import { logger } from '../utils/logger';
import type {
InitSegmentData,
Remuxer,
RemuxerResult,
RemuxedMetadata,
RemuxedTrack,
RemuxedUserdata,
} from '../types/remuxer';
import type {
AudioSample,
AvcSample,
DemuxedAudioTrack,
DemuxedAvcTrack,
DemuxedMetadataTrack,
DemuxedUserdataTrack,
} from '../types/demuxer';
import type { TrackSet } from '../types/track';
import type { SourceBufferName } from '../types/buffer';
import type { Fragment } from '../loader/fragment';
import type { HlsConfig } from '../config';
import { toMsFromMpegTsClock } from '../utils/timescale-conversion';
const MAX_SILENT_FRAME_DURATION = 10 * 1000; // 10 seconds
const AAC_SAMPLES_PER_FRAME = 1024;
const MPEG_AUDIO_SAMPLE_PER_FRAME = 1152;
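// Cached user-agent info. Older Chrome (< 75) and WebKit (< 600) builds are treated as needing
// non-negative decode timestamps, so when requiresPositiveDts is set remuxVideo clamps the first DTS to 0.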
let chromeVersion: number | null = null;
let safariWebkitVersion: number | null = null;
let requiresPositiveDts: boolean = false;
export default class MP4Remuxer implements Remuxer {
private observer: HlsEventEmitter;
private config: HlsConfig;
private typeSupported: any;
private ISGenerated: boolean = false;
private _initPTS!: number;
private _initDTS!: number;
private nextAvcDts: number | null = null;
private nextAudioPts: number | null = null;
private isAudioContiguous: boolean = false;
private isVideoContiguous: boolean = false;
constructor(
observer: HlsEventEmitter,
config: HlsConfig,
typeSupported,
vendor = ''
) {
this.observer = observer;
this.config = config;
this.typeSupported = typeSupported;
this.ISGenerated = false;
if (chromeVersion === null) {
const userAgent = navigator.userAgent || '';
const result = userAgent.match(/Chrome\/(\d+)/i);
chromeVersion = result ? parseInt(result[1]) : 0;
}
if (safariWebkitVersion === null) {
const result = navigator.userAgent.match(/Safari\/(\d+)/i);
safariWebkitVersion = result ? parseInt(result[1]) : 0;
}
requiresPositiveDts =
(!!chromeVersion && chromeVersion < 75) ||
(!!safariWebkitVersion && safariWebkitVersion < 600);
}
destroy() {}
resetTimeStamp(defaultTimeStamp) {
logger.log('[mp4-remuxer]: initPTS & initDTS reset');
this._initPTS = this._initDTS = defaultTimeStamp;
}
resetNextTimestamp() {
logger.log('[mp4-remuxer]: reset next timestamp');
this.isVideoContiguous = false;
this.isAudioContiguous = false;
}
resetInitSegment() {
logger.log('[mp4-remuxer]: ISGenerated flag reset');
this.ISGenerated = false;
}
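// Returns the lowest PTS of the given video samples, normalizing across 33-bit PTS rollover
// so a wrap-around mid-fragment does not skew the minimum used for timeOffset calculations.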
getVideoStartPts(videoSamples) {
let rolloverDetected = false;
const startPTS = videoSamples.reduce((minPTS, sample) => {
const delta = sample.pts - minPTS;
if (delta < -4294967296) {
// 2^32, see PTSNormalize for reasoning, but we're hitting a rollover here, and we don't want that to impact the timeOffset calculation
rolloverDetected = true;
return PTSNormalize(minPTS, sample.pts);
} else if (delta > 0) {
return minPTS;
} else {
return sample.pts;
}
}, videoSamples[0].pts);
if (rolloverDetected) {
logger.debug('PTS rollover detected');
}
return startPTS;
}
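// Remuxes one demuxed chunk: generates the init segment if needed, aligns audio/video time offsets,
// and converts each elementary stream into fmp4 (moof + mdat) payloads.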
remux(
audioTrack: DemuxedAudioTrack,
videoTrack: DemuxedAvcTrack,
id3Track: DemuxedMetadataTrack,
textTrack: DemuxedUserdataTrack,
timeOffset: number,
accurateTimeOffset: boolean,
flush: boolean
): RemuxerResult {
let video;
let audio;
let initSegment;
let text;
let id3;
let independent: boolean | undefined;
let audioTimeOffset = timeOffset;
let videoTimeOffset = timeOffset;
// If we're remuxing audio and video progressively, wait until we've received enough samples for each track before proceeding.
// This is done to synchronize the audio and video streams. We know if the current segment will have samples if the "pid"
// parameter is greater than -1. The pid is set when the PMT is parsed, which contains the tracks list.
// However, if the initSegment has already been generated, or we've reached the end of a segment (flush),
// then we can remux one track without waiting for the other.
const hasAudio = audioTrack.pid > -1;
const hasVideo = videoTrack.pid > -1;
const enoughAudioSamples = audioTrack.samples.length > 0;
const enoughVideoSamples = videoTrack.samples.length > 1;
const canRemuxAvc =
((!hasAudio || enoughAudioSamples) &&
(!hasVideo || enoughVideoSamples)) ||
this.ISGenerated ||
flush;
if (canRemuxAvc) {
if (!this.ISGenerated) {
initSegment = this.generateIS(audioTrack, videoTrack, timeOffset);
}
const isVideoContiguous = this.isVideoContiguous;
if (
enoughVideoSamples &&
!isVideoContiguous &&
this.config.forceKeyFrameOnDiscontinuity
) {
const length = videoTrack.samples.length;
const firstKeyFrameIndex = findKeyframeIndex(videoTrack.samples);
independent = true;
if (firstKeyFrameIndex > 0) {
logger.warn(
`[mp4-remuxer]: Dropped ${firstKeyFrameIndex} out of ${length} video samples due to a missing keyframe`
);
const startPTS = this.getVideoStartPts(videoTrack.samples);
videoTrack.samples = videoTrack.samples.slice(firstKeyFrameIndex);
videoTrack.dropped += firstKeyFrameIndex;
videoTimeOffset +=
(videoTrack.samples[0].pts - startPTS) /
(videoTrack.timescale || 90000);
} else if (firstKeyFrameIndex === -1) {
logger.warn(
`[mp4-remuxer]: No keyframe found out of ${length} video samples`
);
independent = false;
}
}
if (this.ISGenerated) {
if (enoughAudioSamples && enoughVideoSamples) {
// timeOffset is expected to be the offset of the first timestamp of this fragment (first DTS)
// if first audio DTS is not aligned with first video DTS then we need to take that into account
// when providing timeOffset to remuxAudio / remuxVideo. if we don't do that, there might be a permanent / small
// drift between audio and video streams
const startPTS = this.getVideoStartPts(videoTrack.samples);
const tsDelta =
PTSNormalize(audioTrack.samples[0].pts, startPTS) - startPTS;
const audiovideoTimestampDelta = tsDelta / videoTrack.inputTimeScale;
audioTimeOffset += Math.max(0, audiovideoTimestampDelta);
videoTimeOffset += Math.max(0, -audiovideoTimestampDelta);
}
// Purposefully remuxing audio before video, so that remuxVideo can use nextAudioPts, which is calculated in remuxAudio.
if (enoughAudioSamples) {
// if initSegment was generated without audio samples, regenerate it
if (!audioTrack.samplerate) {
logger.warn(
'[mp4-remuxer]: regenerate InitSegment as audio detected'
);
initSegment = this.generateIS(audioTrack, videoTrack, timeOffset);
delete initSegment.video;
}
audio = this.remuxAudio(
audioTrack,
audioTimeOffset,
this.isAudioContiguous,
accurateTimeOffset,
enoughVideoSamples ? videoTimeOffset : undefined
);
if (enoughVideoSamples) {
const audioTrackLength = audio ? audio.endPTS - audio.startPTS : 0;
// if initSegment was generated without video samples, regenerate it
if (!videoTrack.inputTimeScale) {
logger.warn(
'[mp4-remuxer]: regenerate InitSegment as video detected'
);
initSegment = this.generateIS(audioTrack, videoTrack, timeOffset);
}
video = this.remuxVideo(
videoTrack,
videoTimeOffset,
isVideoContiguous,
audioTrackLength
);
}
} else if (enoughVideoSamples) {
video = this.remuxVideo(
videoTrack,
videoTimeOffset,
isVideoContiguous,
0
);
}
if (video && independent !== undefined) {
video.independent = independent;
}
}
}
// Allow ID3 and text to remux, even if more audio/video samples are required
if (this.ISGenerated) {
if (id3Track.samples.length) {
id3 = this.remuxID3(id3Track, timeOffset);
}
if (textTrack.samples.length) {
text = this.remuxText(textTrack, timeOffset);
}
}
return {
audio,
video,
initSegment,
independent,
text,
id3,
};
}
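// Builds MP4 initialization segments for whichever tracks have enough codec data, and on the first
// call of a demuxing context captures initPTS/initDTS used to normalize all subsequent timestamps.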
generateIS(
audioTrack: DemuxedAudioTrack,
videoTrack: DemuxedAvcTrack,
timeOffset
): InitSegmentData | undefined {
const audioSamples = audioTrack.samples;
const videoSamples = videoTrack.samples;
const typeSupported = this.typeSupported;
const tracks: TrackSet = {};
const computePTSDTS = !Number.isFinite(this._initPTS);
let container = 'audio/mp4';
let initPTS: number | undefined;
let initDTS: number | undefined;
let timescale: number | undefined;
if (computePTSDTS) {
initPTS = initDTS = Infinity;
}
if (audioTrack.config && audioSamples.length) {
// let's use the audio sampling rate as the MP4 timescale.
// rationale: there is an integer number of audio samples per audio frame (1024 for AAC),
// so using the audio sampling rate gives an integer MP4 frame duration,
// which avoids potential rounding issues and A/V sync drift
audioTrack.timescale = audioTrack.samplerate;
logger.log(
`[mp4-remuxer]: audio sampling rate : ${audioTrack.samplerate}`
);
if (!audioTrack.isAAC) {
if (typeSupported.mpeg) {
// Chrome and Safari
container = 'audio/mpeg';
audioTrack.codec = '';
} else if (typeSupported.mp3) {
// Firefox
audioTrack.codec = 'mp3';
}
}
tracks.audio = {
id: 'audio',
container: container,
codec: audioTrack.codec,
initSegment:
!audioTrack.isAAC && typeSupported.mpeg
? new Uint8Array(0)
: MP4.initSegment([audioTrack]),
metadata: {
channelCount: audioTrack.channelCount,
},
};
if (computePTSDTS) {
timescale = audioTrack.inputTimeScale;
// remember first PTS of this demuxing context. for audio, PTS = DTS
initPTS = initDTS =
audioSamples[0].pts - Math.round(timescale * timeOffset);
}
}
if (videoTrack.sps && videoTrack.pps && videoSamples.length) {
// let's use input time scale as MP4 video timescale
// we use input time scale straight away to avoid rounding issues on frame duration / cts computation
videoTrack.timescale = videoTrack.inputTimeScale;
tracks.video = {
id: 'main',
container: 'video/mp4',
codec: videoTrack.codec,
initSegment: MP4.initSegment([videoTrack]),
metadata: {
width: videoTrack.width,
height: videoTrack.height,
},
};
if (computePTSDTS) {
timescale = videoTrack.inputTimeScale;
const startPTS = this.getVideoStartPts(videoSamples);
const startOffset = Math.round(timescale * timeOffset);
initDTS = Math.min(
initDTS as number,
PTSNormalize(videoSamples[0].dts, startPTS) - startOffset
);
initPTS = Math.min(initPTS as number, startPTS - startOffset);
}
}
if (Object.keys(tracks).length) {
this.ISGenerated = true;
if (computePTSDTS) {
this._initPTS = initPTS as number;
this._initDTS = initDTS as number;
}
return {
tracks,
initPTS,
timescale,
};
}
}
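// Remuxes AVC samples into a single moof + mdat pair: normalizes PTS/DTS against initPTS and the
// previous fragment, repairs PTS < DTS and inter-fragment gaps/overlaps, then writes the
// length-prefixed NAL units into the mdat.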
remuxVideo(
track: DemuxedAvcTrack,
timeOffset: number,
contiguous: boolean,
audioTrackLength: number
): RemuxedTrack | undefined {
const timeScale: number = track.inputTimeScale;
const inputSamples: Array<AvcSample> = track.samples;
const outputSamples: Array<Mp4Sample> = [];
const nbSamples: number = inputSamples.length;
const initPTS: number = this._initPTS;
let nextAvcDts = this.nextAvcDts;
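// write cursor into the mdat; starts after the 8-byte box header (4-byte size + 4-byte 'mdat' type)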
let offset = 8;
let mp4SampleDuration!: number;
let firstDTS;
let lastDTS;
let minPTS: number = Number.POSITIVE_INFINITY;
let maxPTS: number = Number.NEGATIVE_INFINITY;
let ptsDtsShift = 0;
let sortSamples = false;
// if parsed fragment is contiguous with last one, let's use last DTS value as reference
if (!contiguous || nextAvcDts === null) {
const pts = timeOffset * timeScale;
const cts =
inputSamples[0].pts -
PTSNormalize(inputSamples[0].dts, inputSamples[0].pts);
// if not contiguous, let's use target timeOffset
nextAvcDts = pts - cts;
}
// PTS is coded on 33 bits and can wrap around; after subtracting initPTS, values may also go negative.
// PTSNormalize makes PTS/DTS values monotonic, using the last known DTS value as the reference.
for (let i = 0; i < nbSamples; i++) {
const sample = inputSamples[i];
sample.pts = PTSNormalize(sample.pts - initPTS, nextAvcDts);
sample.dts = PTSNormalize(sample.dts - initPTS, nextAvcDts);
if (sample.dts > sample.pts) {
const PTS_DTS_SHIFT_TOLERANCE_90KHZ = 90000 * 0.2;
ptsDtsShift = Math.max(
Math.min(ptsDtsShift, sample.pts - sample.dts),
-1 * PTS_DTS_SHIFT_TOLERANCE_90KHZ
);
}
if (sample.dts < inputSamples[i > 0 ? i - 1 : i].dts) {
sortSamples = true;
}
}
// sort video samples by DTS, then PTS
if (sortSamples) {
inputSamples.sort(function (a, b) {
const deltadts = a.dts - b.dts;
const deltapts = a.pts - b.pts;
return deltadts || deltapts;
});
}
// Get first/last DTS
firstDTS = inputSamples[0].dts;
lastDTS = inputSamples[inputSamples.length - 1].dts;
// sample duration (as expected by trun MP4 boxes) should be the delta between consecutive sample DTS values.
// compute the average delta between consecutive DTS; it is used below to repair PTS < DTS streams
// and to detect holes/overlaps between fragments.
const averageSampleDuration = Math.round(
(lastDTS - firstDTS) / (nbSamples - 1)
);
// handle broken streams with PTS < DTS, tolerance up to 0.2 seconds
if (ptsDtsShift < 0) {
if (ptsDtsShift < averageSampleDuration * -2) {
// Fix for "CNN special report, with CC" in test-streams (including Safari browser)
// With large PTS < DTS errors such as this, we want to correct CTS while maintaining increasing DTS values
logger.warn(
`PTS < DTS detected in video samples, offsetting DTS from PTS by ${toMsFromMpegTsClock(
-averageSampleDuration,
true
)} ms`
);
let lastDts = ptsDtsShift;
for (let i = 0; i < nbSamples; i++) {
inputSamples[i].dts = lastDts = Math.max(
lastDts,
inputSamples[i].pts - averageSampleDuration
);
inputSamples[i].pts = Math.max(lastDts, inputSamples[i].pts);
}
} else {
// Fix for "Custom IV with bad PTS DTS" in test-streams
// With smaller PTS < DTS errors we can simply move all DTS back. This increases CTS without causing buffer gaps or decode errors in Safari
logger.warn(
`PTS < DTS detected in video samples, shifting DTS by ${toMsFromMpegTsClock(
ptsDtsShift,
true
)} ms to overcome this issue`
);
for (let i = 0; i < nbSamples; i++) {
inputSamples[i].dts = inputSamples[i].dts + ptsDtsShift;
}
}
firstDTS = inputSamples[0].dts;
lastDTS = inputSamples[nbSamples - 1].dts;
}
// if fragments are contiguous, detect hole/overlap between fragments
if (contiguous) {
// check timestamp continuity across consecutive fragments (this is to remove inter-fragment gap/hole)
const delta = firstDTS - nextAvcDts;
const foundHole = delta > averageSampleDuration;
const foundOverlap = delta < -1;
if (foundHole || foundOverlap) {
if (foundHole) {
logger.warn(
`AVC: ${toMsFromMpegTsClock(
delta,
true
)} ms (${delta}dts) hole between fragments detected, filling it`
);
} else {
logger.warn(
`AVC: ${toMsFromMpegTsClock(
-delta,
true
)} ms (${delta}dts) overlapping between fragments detected`
);
}
firstDTS = nextAvcDts;
const firstPTS = inputSamples[0].pts - delta;
inputSamples[0].dts = firstDTS;
inputSamples[0].pts = firstPTS;
logger.log(
`Video: First PTS/DTS adjusted: ${toMsFromMpegTsClock(
firstPTS,
true
)}/${toMsFromMpegTsClock(
firstDTS,
true
)}, delta: ${toMsFromMpegTsClock(delta, true)} ms`
);
}
}
if (requiresPositiveDts) {
firstDTS = Math.max(0, firstDTS);
}
let nbNalu = 0;
let naluLen = 0;
for (let i = 0; i < nbSamples; i++) {
// compute total/avc sample length and nb of NAL units
const sample = inputSamples[i];
const units = sample.units;
const nbUnits = units.length;
let sampleLen = 0;
for (let j = 0; j < nbUnits; j++) {
sampleLen += units[j].data.length;
}
naluLen += sampleLen;
nbNalu += nbUnits;
sample.length = sampleLen;
// normalize PTS/DTS
// ensure sample monotonic DTS
sample.dts = Math.max(sample.dts, firstDTS);
// ensure that the computed PTS is greater than or equal to the sample DTS
sample.pts = Math.max(sample.pts, sample.dts, 0);
minPTS = Math.min(sample.pts, minPTS);
maxPTS = Math.max(sample.pts, maxPTS);
}
lastDTS = inputSamples[nbSamples - 1].dts;
/* concatenate the video data and construct the mdat in place
(need 8 more bytes to fill length and mdat type) */
const mdatSize = naluLen + 4 * nbNalu + 8;
let mdat;
try {
mdat = new Uint8Array(mdatSize);
} catch (err) {
this.observer.emit(Events.ERROR, Events.ERROR, {
type: ErrorTypes.MUX_ERROR,
details: ErrorDetails.REMUX_ALLOC_ERROR,
fatal: false,
bytes: mdatSize,
reason: `fail allocating video mdat ${mdatSize}`,
});
return;
}
const view = new DataView(mdat.buffer);
view.setUint32(0, mdatSize);
mdat.set(MP4.types.mdat, 4);
for (let i = 0; i < nbSamples; i++) {
const avcSample = inputSamples[i];
const avcSampleUnits = avcSample.units;
let mp4SampleLength = 0;
// convert NALU bitstream to MP4 format (prepend NALU with size field)
for (let j = 0, nbUnits = avcSampleUnits.length; j < nbUnits; j++) {
const unit = avcSampleUnits[j];
const unitData = unit.data;
const unitDataLen = unit.data.byteLength;
view.setUint32(offset, unitDataLen);
offset += 4;
mdat.set(unitData, offset);
offset += unitDataLen;
mp4SampleLength += 4 + unitDataLen;
}
// expected sample duration is the Decoding Timestamp diff of consecutive samples
if (i < nbSamples - 1) {
mp4SampleDuration = inputSamples[i + 1].dts - avcSample.dts;
} else {
const config = this.config;
const lastFrameDuration =
avcSample.dts - inputSamples[i > 0 ? i - 1 : i].dts;
if (config.stretchShortVideoTrack && this.nextAudioPts !== null) {
// In some cases, a segment's audio track duration may exceed the video track duration.
// Since we've already remuxed audio, and we know how long the audio track is, we look to
// see if the delta to the next segment is longer than maxBufferHole.
// If so, playback would potentially get stuck, so we artificially inflate
// the duration of the last frame to minimize any potential gap between segments.
const gapTolerance = Math.floor(config.maxBufferHole * timeScale);
const deltaToFrameEnd =
(audioTrackLength
? minPTS + audioTrackLength * timeScale
: this.nextAudioPts) - avcSample.pts;
if (deltaToFrameEnd > gapTolerance) {
// We subtract lastFrameDuration from deltaToFrameEnd to try to prevent any video
// frame overlap. maxBufferHole should be >> lastFrameDuration anyway.
mp4SampleDuration = deltaToFrameEnd - lastFrameDuration;
if (mp4SampleDuration < 0) {
mp4SampleDuration = lastFrameDuration;
}
logger.log(
`[mp4-remuxer]: It is approximately ${
deltaToFrameEnd / 90
} ms to the next segment; using duration ${
mp4SampleDuration / 90
} ms for the last video frame.`
);
} else {
mp4SampleDuration = lastFrameDuration;
}
} else {
mp4SampleDuration = lastFrameDuration;
}
}
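// composition time offset (cts) = PTS - DTS; written per sample into the trun box so B-frames can be reordered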
const compositionTimeOffset = Math.round(avcSample.pts - avcSample.dts);
outputSamples.push(
new Mp4Sample(
avcSample.key,
mp4SampleDuration,
mp4SampleLength,
compositionTimeOffset
)
);
}
if (outputSamples.length && chromeVersion && chromeVersion < 70) {
// Chrome workaround, mark first sample as being a Random Access Point (keyframe) to avoid sourcebuffer append issue
// https://code.google.com/p/chromium/issues/detail?id=229412
const flags = outputSamples[0].flags;
flags.dependsOn = 2;
flags.isNonSync = 0;
}
console.assert(
mp4SampleDuration !== undefined,
'mp4SampleDuration must be computed'
);
// next AVC sample DTS should be equal to last sample DTS + last sample duration (in PES timescale)
this.nextAvcDts = nextAvcDts = lastDTS + mp4SampleDuration;
this.isVideoContiguous = true;
const moof = MP4.moof(
track.sequenceNumber++,
firstDTS,
Object.assign({}, track, {
samples: outputSamples,
})
);
const type: SourceBufferName = 'video';
const data = {
data1: moof,
data2: mdat,
startPTS: minPTS / timeScale,
endPTS: (maxPTS + mp4SampleDuration) / timeScale,
startDTS: firstDTS / timeScale,
endDTS: (nextAvcDts as number) / timeScale,
type,
hasAudio: false,
hasVideo: true,
nb: outputSamples.length,
dropped: track.dropped,
};
track.samples = [];
track.dropped = 0;
console.assert(mdat.length, 'MDAT length must not be zero');
return data;
}
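// Remuxes AAC/MPEG audio samples into a moof + mdat pair (or raw MPEG data when typeSupported.mpeg),
// injecting silent frames into gaps and dropping overlapping frames to keep audio in sync with video.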
remuxAudio(
track: DemuxedAudioTrack,
timeOffset: number,
contiguous: boolean,
accurateTimeOffset: boolean,
videoTimeOffset?: number
): RemuxedTrack | undefined {
const inputTimeScale: number = track.inputTimeScale;
const mp4timeScale: number = track.samplerate
? track.samplerate
: inputTimeScale;
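// ratio between the input (MPEG-TS, 90kHz) timescale and the MP4 (sample-rate) timescale;
// multiplying an MP4-timescale duration by it yields input-timescale ticks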
const scaleFactor: number = inputTimeScale / mp4timeScale;
const mp4SampleDuration: number = track.isAAC
? AAC_SAMPLES_PER_FRAME
: MPEG_AUDIO_SAMPLE_PER_FRAME;
const inputSampleDuration: number = mp4SampleDuration * scaleFactor;
const initPTS: number = this._initPTS;
const rawMPEG: boolean = !track.isAAC && this.typeSupported.mpeg;
const outputSamples: Array<Mp4Sample> = [];
let inputSamples: Array<AudioSample> = track.samples;
let offset: number = rawMPEG ? 0 : 8;
let fillFrame: any;
let nextAudioPts: number = this.nextAudioPts || -1;
// window.audioSamples ? window.audioSamples.push(inputSamples.map(s => s.pts)) : (window.audioSamples = [inputSamples.map(s => s.pts)]);
// for audio samples, also consider consecutive fragments as being contiguous (even if a level switch occurs),
// for the sake of clarity:
// consecutive fragments are frags with
// - less than a 100ms gap between the new time offset (if accurate) and the next expected PTS, OR
// - less than 20 audio frames distance
// contiguous fragments are consecutive fragments from the same quality level (same level, new SN = old SN + 1)
// this helps ensure audio continuity
// and also avoids audio glitches/cuts when switching quality, or reporting a wrong duration on the first audio frame
const timeOffsetMpegTS = timeOffset * inputTimeScale;
this.isAudioContiguous = contiguous =
contiguous ||
((inputSamples.length &&
nextAudioPts > 0 &&
((accurateTimeOffset &&
Math.abs(timeOffsetMpegTS - nextAudioPts) < 9000) ||
Math.abs(
PTSNormalize(inputSamples[0].pts - initPTS, timeOffsetMpegTS) -
nextAudioPts
) <
20 * inputSampleDuration)) as boolean);
// compute normalized PTS
inputSamples.forEach(function (sample) {
sample.pts = sample.dts = PTSNormalize(
sample.pts - initPTS,
timeOffsetMpegTS
);
});
if (!contiguous || nextAudioPts < 0) {
// filter out samples with negative PTS that are not playable anyway
// if we don't remove these negative samples, they will shift all audio samples forward,
// leading to audio overlap between the current and next fragment
inputSamples = inputSamples.filter((sample) => sample.pts >= 0);
// in case all samples have negative PTS, and have been filtered out, return now
if (!inputSamples.length) {
return;
}
if (videoTimeOffset === 0) {
// Set the start to 0 to match video so that start gaps larger than inputSampleDuration are filled with silence
nextAudioPts = 0;
} else if (accurateTimeOffset) {
// When not seeking, not live, and LevelDetails.PTSKnown, use fragment start as predicted next audio PTS
nextAudioPts = Math.max(0, timeOffsetMpegTS);
} else {
// if frags are not contiguous and we can't trust the time offset, use the first sample PTS as the next audio PTS
nextAudioPts = inputSamples[0].pts;
}
}
// If the audio track is missing samples, the frames seem to get "left-shifted" within the
// resulting mp4 segment, causing sync issues and leaving gaps at the end of the audio segment.
// In an effort to prevent this from happening, we inject frames here where there are gaps.
// When possible, we inject a silent frame; when that's not possible, we duplicate the last
// frame.
if (track.isAAC) {
const maxAudioFramesDrift = this.config.maxAudioFramesDrift;
for (let i = 0, nextPts = nextAudioPts; i < inputSamples.length; ) {
// First, let's see how far off this frame is from where we expect it to be
const sample = inputSamples[i];
const pts = sample.pts;
const delta = pts - nextPts;
const duration = Math.abs((1000 * delta) / inputTimeScale);
// When remuxing with video, if we're overlapping by more than a duration, drop this sample to stay in sync
if (
delta <= -maxAudioFramesDrift * inputSampleDuration &&
videoTimeOffset !== undefined
) {
if (contiguous || i > 0) {
logger.warn(
`[mp4-remuxer]: Dropping 1 audio frame @ ${(
nextPts / inputTimeScale
).toFixed(3)}s due to ${Math.round(duration)} ms overlap.`
);
inputSamples.splice(i, 1);
// Don't touch nextPts or i
} else {
// When changing qualities we can't trust that audio has been appended up to nextAudioPts
// Warn about the overlap but do not drop samples as that can introduce buffer gaps
logger.warn(
`Audio frame @ ${(pts / inputTimeScale).toFixed(
3
)}s overlaps nextAudioPts by ${Math.round(
(1000 * delta) / inputTimeScale
)} ms.`
);
nextPts = pts + inputSampleDuration;
i++;
}
} // eslint-disable-line brace-style
// Insert missing frames if:
// 1: We're more than maxAudioFramesDrift frames away
// 2: Not more than MAX_SILENT_FRAME_DURATION away
// 3: remuxing with video (videoTimeOffset !== undefined)
else if (
delta >= maxAudioFramesDrift * inputSampleDuration &&
duration < MAX_SILENT_FRAME_DURATION &&
videoTimeOffset !== undefined
) {
const missing = Math.floor(delta / inputSampleDuration);
// Adjust nextPts so that silent samples are aligned with media pts. This will prevent media samples from
// later being shifted if nextPts is based on timeOffset and delta is not a multiple of inputSampleDuration.
nextPts = pts - missing * inputSampleDuration;
logger.warn(
`[mp4-remuxer]: Injecting ${missing} audio frame @ ${(
nextPts / inputTimeScale
).toFixed(3)}s due to ${Math.round(
(1000 * delta) / inputTimeScale
)} ms gap.`
);
for (let j = 0; j < missing; j++) {
const newStamp = Math.max(nextPts as number, 0);
fillFrame = AAC.getSilentFrame(
track.manifestCodec || track.codec,
track.channelCount
);
if (!fillFrame) {
logger.log(
'[mp4-remuxer]: Unable to get silent frame for given audio codec; duplicating last frame instead.'
);
fillFrame = sample.unit.subarray();
}
inputSamples.splice(i, 0, {
unit: fillFrame,
pts: newStamp,
dts: newStamp,
});
nextPts += inputSampleDuration;
i++;
}
// Adjust sample to next expected pts
sample.pts = sample.dts = nextPts;
nextPts += inputSampleDuration;
i++;
} else {
// Otherwise, just adjust pts
sample.pts = sample.dts = nextPts;
nextPts += inputSampleDuration;
i++;
}
}
}
let firstPTS: number | null = null;
let lastPTS: number | null = null;
let mdat: any;
let mdatSize: number = 0;
let sampleLength: number = inputSamples.length;
while (sampleLength--) {
mdatSize += inputSamples[sampleLength].unit.byteLength;
}
for (let j = 0, nbSamples = inputSamples.length; j < nbSamples; j++) {
const audioSample = inputSamples[j];
const unit = audioSample.unit;
let pts = audioSample.pts;
if (lastPTS !== null) {
// If we have more than one sample, set the duration of the sample to the "real" duration; the PTS diff with
// the previous sample
const prevSample = outputSamples[j - 1];
prevSample.duration = Math.round((pts - lastPTS) / scaleFactor);
} else {
const delta = Math.round(
(1000 * (pts - nextAudioPts)) / inputTimeScale
);
let numMissingFrames = 0;
// if fragments are contiguous, detect hole/overlap between fragments
// contiguous fragments are consecutive fragments from same quality level (same level, new SN = old SN + 1)
if (contiguous && track.isAAC) {
if (delta > 0 && delta < MAX_SILENT_FRAME_DURATION) {
numMissingFrames = Math.round(
(pts - nextAudioPts) / inputSampleDuration
);
logger.log(
`[mp4-remuxer]: ${delta} ms hole between AAC samples detected, filling it`
);
if (numMissingFrames > 0) {
fillFrame = AAC.getSilentFrame(
track.manifestCodec || track.codec,
track.channelCount
);
if (!fillFrame) {
fillFrame = unit.subarray();
}
mdatSize += numMissingFrames * fillFrame.length;
}
// if we have frame overlap, overlapping for more than half a frame duration
} else if (delta < -12) {
// drop overlapping audio frames... browser will deal with it
logger.log(
`[mp4-remuxer]: drop overlapping AAC sample, expected/parsed/delta:${(
nextAudioPts / inputTimeScale
).toFixed(3)}s/${(pts / inputTimeScale).toFixed(3)}s/${-delta}ms`
);
mdatSize -= unit.byteLength;
continue;
}
// set PTS/DTS to expected PTS/DTS
pts = nextAudioPts;
}
// remember first PTS of our audioSamples
firstPTS = pts;
if (mdatSize > 0) {
/* concatenate the audio data and construct the mdat in place
(need 8 more bytes to fill length and mdat type) */
mdatSize += offset;
try {
mdat = new Uint8Array(mdatSize);
} catch (err) {
this.observer.emit(Events.ERROR, Events.ERROR, {
type: ErrorTypes.MUX_ERROR,
details: ErrorDetails.REMUX_ALLOC_ERROR,
fatal: false,
bytes: mdatSize,
reason: `fail allocating audio mdat ${mdatSize}`,
});
return;
}
if (!rawMPEG) {
const view = new DataView(mdat.buffer);
view.setUint32(0, mdatSize);
mdat.set(MP4.types.mdat, 4);
}
} else {
// no audio samples
return;
}
for (let i = 0; i < numMissingFrames; i++) {
fillFrame = AAC.getSilentFrame(
track.manifestCodec || track.codec,
track.channelCount
);
if (!fillFrame) {
logger.log(
'[mp4-remuxer]: Unable to get silent frame for given audio codec; duplicating the current frame instead'
);
fillFrame = unit.subarray();
}
mdat.set(fillFrame, offset);
offset += fillFrame.byteLength;
outputSamples.push(
new Mp4Sample(true, AAC_SAMPLES_PER_FRAME, fillFrame.byteLength, 0)
);
}
}
mdat.set(unit, offset);
const unitLen = unit.byteLength;
offset += unitLen;
// Default the sample's duration to the computed mp4SampleDuration, which will either be 1024 for AAC or 1152 for MPEG
// In the case that we have 1 sample, this will be the duration. If we have more than one sample, the duration
// becomes the PTS diff with the previous sample
outputSamples.push(new Mp4Sample(true, mp4SampleDuration, unitLen, 0));
lastPTS = pts;
}
// We could end up with no audio samples if all input samples were overlapping with the previously remuxed ones
const nbSamples = outputSamples.length;
if (!nbSamples) {
return;
}
// The next audio sample PTS should be equal to last sample PTS + duration
const lastSample = outputSamples[outputSamples.length - 1];
this.nextAudioPts = nextAudioPts =
lastPTS! + scaleFactor * lastSample.duration;
// Set the track samples from inputSamples to outputSamples before remuxing
const moof = rawMPEG
? new Uint8Array(0)
: MP4.moof(
track.sequenceNumber++,
firstPTS! / scaleFactor,
Object.assign({}, track, { samples: outputSamples })
);
// Clear the track samples. This also clears the samples array in the demuxer, since the reference is shared
track.samples = [];
const start = firstPTS! / inputTimeScale;
const end = nextAudioPts / inputTimeScale;
const type: SourceBufferName = 'audio';
const audioData = {
data1: moof,
data2: mdat,
startPTS: start,
endPTS: end,
startDTS: start,
endDTS: end,
type,
hasAudio: true,
hasVideo: false,
nb: nbSamples,
};
this.isAudioContiguous = true;
console.assert(mdat.length, 'MDAT length must not be zero');
return audioData;
}
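// Generates a silent AAC track covering the video fragment's duration, so that a fragment whose
// audio track contained no samples still produces an audio buffer aligned with the video.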
remuxEmptyAudio(
track: DemuxedAudioTrack,
timeOffset: number,
contiguous: boolean,
videoData: Fragment
): RemuxedTrack | undefined {
const inputTimeScale: number = track.inputTimeScale;
const mp4timeScale: number = track.samplerate
? track.samplerate
: inputTimeScale;
const scaleFactor: number = inputTimeScale / mp4timeScale;
const nextAudioPts: number | null = this.nextAudioPts;
// sync with video's timestamp
const startDTS: number =
(nextAudioPts !== null
? nextAudioPts
: videoData.startDTS * inputTimeScale) + this._initDTS;
const endDTS: number = videoData.endDTS * inputTimeScale + this._initDTS;
// one sample's duration value
const frameDuration: number = scaleFactor * AAC_SAMPLES_PER_FRAME;
// number of silent samples needed to cover this segment's duration
const nbSamples: number = Math.ceil((endDTS - startDTS) / frameDuration);
// silent frame
const silentFrame: Uint8Array | undefined = AAC.getSilentFrame(
track.manifestCodec || track.codec,
track.channelCount
);
logger.warn('[mp4-remuxer]: remux empty Audio');
// Can't remux if we can't generate a silent frame...
if (!silentFrame) {
logger.trace(
'[mp4-remuxer]: Unable to remuxEmptyAudio since we were unable to get a silent frame for given audio codec'
);
return;
}
const samples: Array<any> = [];
for (let i = 0; i < nbSamples; i++) {
const stamp = startDTS + i * frameDuration;
samples.push({ unit: silentFrame, pts: stamp, dts: stamp });
}
track.samples = samples;
return this.remuxAudio(track, timeOffset, contiguous, false);
}
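// Converts ID3 sample timestamps from the MPEG-TS clock to seconds relative to initPTS/initDTS.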
remuxID3(
track: DemuxedMetadataTrack,
timeOffset: number
): RemuxedMetadata | undefined {
const length = track.samples.length;
if (!length) {
return;
}
const inputTimeScale = track.inputTimeScale;
const initPTS = this._initPTS;
const initDTS = this._initDTS;
for (let index = 0; index < length; index++) {
const sample = track.samples[index];
// setting id3 pts, dts to relative time
// using this._initPTS and this._initDTS to calculate relative time
sample.pts =
PTSNormalize(sample.pts - initPTS, timeOffset * inputTimeScale) /
inputTimeScale;
sample.dts =
PTSNormalize(sample.dts - initDTS, timeOffset * inputTimeScale) /
inputTimeScale;
}
const samples = track.samples;
track.samples = [];
return {
samples,
};
}
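// Converts user-data (text) sample timestamps to seconds relative to initPTS and sorts them by PTS.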
remuxText(
track: DemuxedUserdataTrack,
timeOffset: number
): RemuxedUserdata | undefined {
const length = track.samples.length;
if (!length) {
return;
}
const inputTimeScale = track.inputTimeScale;
const initPTS = this._initPTS;
for (let index = 0; index < length; index++) {
const sample = track.samples[index];
// setting text pts, dts to relative time
// using this._initPTS and this._initDTS to calculate relative time
sample.pts =
PTSNormalize(sample.pts - initPTS, timeOffset * inputTimeScale) /
inputTimeScale;
}
track.samples.sort((a, b) => a.pts - b.pts);
const samples = track.samples;
track.samples = [];
return {
samples,
};
}
}
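// Normalizes a 33-bit MPEG-TS timestamp onto the same rollover period as `reference` by adding or
// subtracting 2^33 until the two values are within 2^32 of each other.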
export function PTSNormalize(value: number, reference: number | null): number {
let offset;
if (reference === null) {
return value;
}
if (reference < value) {
// - 2^33
offset = -8589934592;
} else {
// + 2^33
offset = 8589934592;
}
/* PTS is 33 bits (from 0 to 2^33 - 1)
if the diff between value and reference is bigger than half of the amplitude (2^32), it means that
PTS looping occurred; fill the gap */
while (Math.abs(value - reference) > 4294967296) {
value += offset;
}
return value;
}
function findKeyframeIndex(samples: Array<AvcSample>): number {
for (let i = 0; i < samples.length; i++) {
if (samples[i].key) {
return i;
}
}
return -1;
}
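// Per-sample entry consumed by MP4.moof when writing the trun box: duration, size, composition time
// offset and sample flags (dependsOn = 2 / isNonSync = 0 marks a sync sample / keyframe).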
class Mp4Sample {
public size: number;
public duration: number;
public cts: number;
public flags: Mp4SampleFlags;
constructor(isKeyframe: boolean, duration, size, cts) {
this.duration = duration;
this.size = size;
this.cts = cts;
this.flags = new Mp4SampleFlags(isKeyframe);
}
}
class Mp4SampleFlags {
public isLeading: 0 = 0;
public isDependedOn: 0 = 0;
public hasRedundancy: 0 = 0;
public degradPrio: 0 = 0;
public dependsOn: 1 | 2 = 1;
public isNonSync: 0 | 1 = 1;
constructor(isKeyframe) {
this.dependsOn = isKeyframe ? 2 : 1;
this.isNonSync = isKeyframe ? 0 : 1;
}
}