[client] audio: adjust playback speed to match audio device clock

This change is based on the techniques described in [1] and [2].

The input audio stream from Spice is not synchronised to the audio playback
device. While the input and output may both be nominally running at 48 kHz,
their actual rates will differ by a tiny fraction of a percent. Given enough
time (typically on the order of a few hours), this
will result in the ring buffer becoming completely full or completely
empty. It will stay in this state permanently, periodically resulting in
glitches as the buffer repeatedly underruns or overruns.
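
For a sense of scale (with made-up but representative numbers): at 48 kHz,
a rate mismatch of just 0.005% amounts to 2.4 frames per second, so a
one-second (48000 frame) buffer that starts half full will drift into
permanent underrun or overrun in roughly 10000 seconds, a little under
three hours.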

To address this, adjust the speed of the received data to match the rate at
which it is being consumed by the audio device. This will result in a
slight pitch shift, but the changes should be small and smooth enough that
this is unnoticeable to the user.
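(For scale: a 0.1% speed change is a pitch shift of 1200 * log2(1.001), or
about 1.7 cents.)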

The process works roughly as follows:
1. Every time audio data is received from Spice, or consumed by the audio
   device, sample the current time. These samples are fed into a pair of
   delay-locked loops (DLLs) to produce smoothed approximations of the two
   clocks (see the sketch after this list).
2. Compute the difference between the two clocks and compare this against
   the target latency to produce an error value. This error value is quite
   stable during normal operation, but can change rapidly due to external
   factors, particularly at the start of playback. To smooth out any sudden
   changes in playback speed, which would be noticeable to the user, this
   value is also filtered through another delay-locked loop.
3. Feed this error value into a PI controller to produce a ratio: the
   target playback speed needed to bring the error back towards zero.
4. Resample the input audio using the computed ratio to apply the speed
   change. The output of the resampler is what is ultimately inserted into
   the ring buffer for consumption by the audio device.
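
To make steps 1-3 concrete, below is a minimal standalone sketch of the
DLL update and the PI step. The coefficients and gains match those used in
this commit (DLL bandwidth 0.05 Hz, kp = 0.5e-6, ki = 1.0e-16); the type
and function names are simplified for illustration, and a fixed period
size is assumed:

    #include <math.h>
    #include <stdint.h>

    typedef struct
    {
      double  periodSec; // smoothed estimate of the period length (seconds)
      int64_t nextTime;  // predicted time of the next tick (nanoseconds)
      double  b, c;      // loop filter coefficients
    }
    DLL;

    // initialise the loop for a given bandwidth (Hz) and nominal period
    static void dllInit(DLL * dll, double bandwidth, double periodSec,
        int64_t now)
    {
      double omega   = 2.0 * M_PI * bandwidth * periodSec;
      dll->periodSec = periodSec;
      dll->nextTime  = now + llrint(periodSec * 1.0e9);
      dll->b         = M_SQRT2 * omega;
      dll->c         = omega * omega;
    }

    // feed in the arrival time of one tick; returns the filtered clock error
    static double dllUpdate(DLL * dll, int64_t now)
    {
      double error    = (now - dll->nextTime) * 1.0e-9;
      dll->nextTime  += llrint((dll->b * error + dll->periodSec) * 1.0e9);
      dll->periodSec += dll->c * error;
      return error;
    }

    // map the filtered offset error (in frames) to a resampling ratio that
    // can be passed to src_process() as src_ratio
    static double piRatio(double offsetError, double periodSec,
        double * integral)
    {
      const double kp = 0.5e-6;
      const double ki = 1.0e-16;
      *integral += offsetError * periodSec;
      return 1.0 + kp * offsetError + ki * *integral;
    }

In the real code these updates live in playbackPullFrames() (device clock)
and audio_playbackData() (Spice clock), which additionally handle period
size changes and slew the ring buffer when the clock error becomes too
large to correct smoothly.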

Since this process targets a specific latency value, rather than simply
trying to rate-match the input and output, it also has the effect of
'correcting' latency issues. If a high latency application (such as a media
player) is already running, the time between requesting the start of
playback and the audio device actually starting to consume samples can be
very high, easily in the hundreds of milliseconds. The changes here will
automatically adjust the playback speed over the course of a few minutes to
bring the latency back down to the target value.
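
As a worked example: removing 300 ms of excess latency over three minutes
requires a resampling ratio about 0.17% above unity (0.3 s / 180 s), i.e.
a pitch shift of roughly 3 cents for the duration of the correction.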

[1] https://kokkinizita.linuxaudio.org/papers/adapt-resamp.pdf
[2] https://kokkinizita.linuxaudio.org/papers/usingdll.pdf
Author: Chris Spencer, 2022-01-26 20:55:24 +00:00 (committed by Geoffrey McRae)
parent 599fdd6ffd
commit dd2d84a080
5 changed files with 318 additions and 18 deletions

CMakeLists.txt

@@ -26,6 +26,7 @@ include(UninstallTarget)
find_package(PkgConfig)
pkg_check_modules(FONTCONFIG REQUIRED IMPORTED_TARGET fontconfig)
pkg_check_modules(SAMPLERATE REQUIRED IMPORTED_TARGET samplerate)
option(ENABLE_OPENGL "Enable the OpenGL renderer" ON)
add_feature_info(ENABLE_OPENGL ENABLE_OPENGL "Legacy OpenGL renderer.")
@@ -160,6 +161,7 @@ target_compile_definitions(looking-glass-client PRIVATE CIMGUI_DEFINE_ENUMS_AND_
target_link_libraries(looking-glass-client
${EXE_FLAGS}
PkgConfig::FONTCONFIG
PkgConfig::SAMPLERATE
lg_common
displayservers
lgmp

pipewire.c

@@ -207,7 +207,7 @@ static void pipewire_playbackSetup(int channels, int sampleRate,
pw.playback.channels = channels;
pw.playback.sampleRate = sampleRate;
pw.playback.stride = sizeof(uint16_t) * channels;
pw.playback.stride = sizeof(float) * channels;
pw.playback.pullFn = pullFn;
pw.playback.startFrames = maxLatencyFrames;
@@ -236,7 +236,7 @@ static void pipewire_playbackSetup(int channels, int sampleRate,
params[0] = spa_format_audio_raw_build(&b, SPA_PARAM_EnumFormat,
&SPA_AUDIO_INFO_RAW_INIT(
.format = SPA_AUDIO_FORMAT_S16,
.format = SPA_AUDIO_FORMAT_F32,
.channels = channels,
.rate = sampleRate
));

pulseaudio.c

@@ -220,6 +220,11 @@ static void pulseaudio_free(void)
static void pulseaudio_write_cb(pa_stream * p, size_t nbytes, void * userdata)
{
// PulseAudio tries to pull data from the stream as soon as it is created for
// some reason, even though it is corked
if (pa.sinkCorked)
return;
uint8_t * dst;
pa_stream_begin_write(p, (void **)&dst, &nbytes);
@@ -250,7 +255,7 @@ static void pulseaudio_setup(int channels, int sampleRate,
const int PERIOD_LEN = 80;
pa_sample_spec spec = {
.format = PA_SAMPLE_S16LE,
.format = PA_SAMPLE_FLOAT32,
.rate = sampleRate,
.channels = channels
};
@@ -279,7 +284,7 @@
PA_STREAM_START_CORKED | PA_STREAM_ADJUST_LATENCY,
NULL, NULL);
pa.sinkStride = channels * sizeof(uint16_t);
pa.sinkStride = channels * sizeof(float);
pa.sinkPullFn = pullFn;
pa.sinkStart = attribs.tlength / pa.sinkStride;
pa.sinkCorked = true;

audiodev.h

@@ -45,7 +45,7 @@ struct LG_AudioDevOps
struct
{
/* setup the stream for playback but don't start it yet
* Note: currently SPICE only supports S16 samples so always assume so
* Note: the pull function returns f32 samples
*/
void (*setup)(int channels, int sampleRate, LG_AudioPullFn pullFn);

audio.c

@@ -26,6 +26,9 @@
#include "dynamic/audiodev.h"
#include <math.h>
#include <samplerate.h>
#include <stdalign.h>
#include <string.h>
typedef enum
@@ -40,6 +43,45 @@ StreamState;
#define STREAM_ACTIVE(state) \
(state == STREAM_STATE_SETUP || state == STREAM_STATE_RUN)
typedef struct
{
int periodFrames;
double periodSec;
int64_t nextTime;
int64_t nextPosition;
double b;
double c;
}
PlaybackDeviceData;
typedef struct
{
float * framesIn;
float * framesOut;
int framesOutSize;
int periodFrames;
double periodSec;
int64_t nextTime;
int64_t nextPosition;
double b;
double c;
int64_t devLastTime;
int64_t devNextTime;
int64_t devLastPosition;
int64_t devNextPosition;
double offsetError;
double offsetErrorIntegral;
double ratioIntegral;
SRC_STATE * src;
}
PlaybackSpiceData;
typedef struct
{
struct LG_AudioDevOps * audioDev;
@@ -50,13 +92,21 @@ typedef struct
int volumeChannels;
uint16_t volume[8];
bool mute;
int channels;
int sampleRate;
int stride;
RingBuffer buffer;
RingBuffer deviceTiming;
LG_Lock lock;
RingBuffer timings;
GraphHandle graph;
// These two structs contain data specifically for use in the device and
// Spice data threads respectively. Keep them on separate cache lines to
// avoid false sharing
alignas(64) PlaybackDeviceData deviceData;
alignas(64) PlaybackSpiceData spiceData;
}
playback;
@@ -75,6 +125,13 @@ AudioState;
static AudioState audio = { 0 };
typedef struct
{
int64_t nextTime;
int64_t nextPosition;
}
PlaybackDeviceTick;
static void playbackStopNL(void);
void audio_init(void)
@@ -132,6 +189,16 @@ static void playbackStopNL(void)
audio.playback.state = STREAM_STATE_STOP;
audio.audioDev->playback.stop();
ringbuffer_free(&audio.playback.buffer);
ringbuffer_free(&audio.playback.deviceTiming);
audio.playback.spiceData.src = src_delete(audio.playback.spiceData.src);
if (audio.playback.spiceData.framesIn)
{
free(audio.playback.spiceData.framesIn);
free(audio.playback.spiceData.framesOut);
audio.playback.spiceData.framesIn = NULL;
audio.playback.spiceData.framesOut = NULL;
}
if (audio.playback.timings)
{
@@ -142,13 +209,68 @@ static void playbackStopNL(void)
static int playbackPullFrames(uint8_t * dst, int frames)
{
DEBUG_ASSERT(frames >= 0);
if (frames == 0)
return frames;
PlaybackDeviceData * data = &audio.playback.deviceData;
int64_t now = nanotime();
if (audio.playback.buffer)
frames = ringbuffer_consume(audio.playback.buffer, dst, frames);
{
// Measure the device clock and post to the Spice thread
if (frames != data->periodFrames)
{
bool init = data->periodFrames == 0;
if (init)
data->nextTime = now;
data->periodFrames = frames;
data->periodSec = (double) frames / audio.playback.sampleRate;
data->nextTime += llrint(data->periodSec * 1.0e9);
data->nextPosition += frames;
double bandwidth = 0.05;
double omega = 2.0 * M_PI * bandwidth * data->periodSec;
data->b = M_SQRT2 * omega;
data->c = omega * omega;
}
else
{
double error = (now - data->nextTime) * 1.0e-9;
if (fabs(error) >= 0.2)
{
// Clock error is too high; slew the read pointer and reset the timing
// parameters to avoid getting too far out of sync
int slewFrames = round(error * audio.playback.sampleRate);
ringbuffer_consume(audio.playback.buffer, NULL, slewFrames);
data->periodSec = (double) frames / audio.playback.sampleRate;
data->nextTime = now + llrint(data->periodSec * 1.0e9);
data->nextPosition += slewFrames + frames;
}
else
{
data->nextTime +=
llrint((data->b * error + data->periodSec) * 1.0e9);
data->periodSec += data->c * error;
data->nextPosition += frames;
}
}
PlaybackDeviceTick tick = {
.nextTime = data->nextTime,
.nextPosition = data->nextPosition
};
ringbuffer_append(audio.playback.deviceTiming, &tick, 1);
ringbuffer_consume(audio.playback.buffer, dst, frames);
}
else
frames = 0;
if (audio.playback.state == STREAM_STATE_DRAIN &&
ringbuffer_getCount(audio.playback.buffer) == 0)
ringbuffer_getCount(audio.playback.buffer) <= 0)
{
LG_LOCK(audio.playback.lock);
playbackStopNL();
@@ -172,18 +294,37 @@ void audio_playbackStart(int channels, int sampleRate, PSAudioFormat format,
playbackStopNL();
}
// Using a bounded ring buffer for now. We are not currently doing anything to
// keep the input and output in sync, so if we were using an unbounded buffer,
// it would eventually end up in a state of permanent underrun or overrun and
// the user would only hear silence
const int bufferFrames = sampleRate;
audio.playback.buffer = ringbuffer_new(bufferFrames,
channels * sizeof(uint16_t));
int srcError;
audio.playback.spiceData.src =
src_new(SRC_SINC_BEST_QUALITY, channels, &srcError);
if (!audio.playback.spiceData.src)
{
DEBUG_ERROR("Failed to create resampler: %s", src_strerror(srcError));
goto done;
}
const int bufferFrames = sampleRate;
audio.playback.buffer = ringbuffer_newUnbounded(bufferFrames,
channels * sizeof(float));
audio.playback.deviceTiming = ringbuffer_new(16, sizeof(PlaybackDeviceTick));
audio.playback.channels = channels;
audio.playback.sampleRate = sampleRate;
audio.playback.stride = channels * sizeof(uint16_t);
audio.playback.stride = channels * sizeof(float);
audio.playback.state = STREAM_STATE_SETUP;
audio.playback.deviceData.periodFrames = 0;
audio.playback.deviceData.nextPosition = 0;
audio.playback.spiceData.periodFrames = 0;
audio.playback.spiceData.nextPosition = 0;
audio.playback.spiceData.devLastTime = INT64_MIN;
audio.playback.spiceData.devNextTime = INT64_MIN;
audio.playback.spiceData.offsetError = 0.0;
audio.playback.spiceData.offsetErrorIntegral = 0.0;
audio.playback.spiceData.ratioIntegral = 0.0;
audio.audioDev->playback.setup(channels, sampleRate, playbackPullFrames);
// if a volume level was stored, set it before we return
@@ -203,6 +344,7 @@ void audio_playbackStart(int channels, int sampleRate, PSAudioFormat format,
audio.playback.state = STREAM_STATE_SETUP;
done:
LG_UNLOCK(audio.playback.lock);
}
@@ -246,14 +388,165 @@ void audio_playbackMute(bool mute)
void audio_playbackData(uint8_t * data, size_t size)
{
if (!audio.audioDev)
if (!audio.audioDev || size == 0)
return;
if (!STREAM_ACTIVE(audio.playback.state))
return;
int frames = size / audio.playback.stride;
ringbuffer_append(audio.playback.buffer, data, frames);
PlaybackSpiceData * spiceData = &audio.playback.spiceData;
int64_t now = nanotime();
// Convert from s16 to f32 samples
int spiceStride = audio.playback.channels * sizeof(int16_t);
int frames = size / spiceStride;
bool periodChanged = frames != spiceData->periodFrames;
bool init = spiceData->periodFrames == 0;
if (periodChanged)
{
if (spiceData->framesIn)
{
free(spiceData->framesIn);
free(spiceData->framesOut);
}
spiceData->periodFrames = frames;
spiceData->framesIn = malloc(frames * audio.playback.stride);
spiceData->framesOutSize = round(frames * 1.1);
spiceData->framesOut =
malloc(spiceData->framesOutSize * audio.playback.stride);
}
src_short_to_float_array((int16_t *) data, spiceData->framesIn,
frames * audio.playback.channels);
// Receive timing information from the audio device thread
PlaybackDeviceTick deviceTick;
while (ringbuffer_consume(audio.playback.deviceTiming, &deviceTick, 1))
{
spiceData->devLastTime = spiceData->devNextTime;
spiceData->devLastPosition = spiceData->devNextPosition;
spiceData->devNextTime = deviceTick.nextTime;
spiceData->devNextPosition = deviceTick.nextPosition;
}
// Measure the Spice audio clock
int64_t curTime;
int64_t curPosition;
if (periodChanged)
{
if (init)
spiceData->nextTime = now;
curTime = spiceData->nextTime;
curPosition = spiceData->nextPosition;
spiceData->periodSec = (double) frames / audio.playback.sampleRate;
spiceData->nextTime += llrint(spiceData->periodSec * 1.0e9);
double bandwidth = 0.05;
double omega = 2.0 * M_PI * bandwidth * spiceData->periodSec;
spiceData->b = M_SQRT2 * omega;
spiceData->c = omega * omega;
}
else
{
double error = (now - spiceData->nextTime) * 1.0e-9;
if (fabs(error) >= 0.2)
{
// Clock error is too high; slew the write pointer and reset the timing
// parameters to avoid getting too far out of sync
int slewFrames = round(error * audio.playback.sampleRate);
ringbuffer_append(audio.playback.buffer, NULL, slewFrames);
curTime = now;
curPosition = spiceData->nextPosition + slewFrames;
spiceData->periodSec = (double) frames / audio.playback.sampleRate;
spiceData->nextTime = now + llrint(spiceData->periodSec * 1.0e9);
spiceData->nextPosition = curPosition;
}
else
{
curTime = spiceData->nextTime;
curPosition = spiceData->nextPosition;
spiceData->nextTime +=
llrint((spiceData->b * error + spiceData->periodSec) * 1.0e9);
spiceData->periodSec += spiceData->c * error;
}
}
// Measure the offset between the Spice position and the device position,
// and how far away this is from the target latency. We use this to adjust
// the playback speed to bring them back in line. This value can change
// quite rapidly, particularly at the start of playback, so filter it to
// avoid sudden pitch shifts which will be noticeable to the user.
double offsetError = spiceData->offsetError;
if (spiceData->devLastTime != INT64_MIN)
{
// Interpolate to calculate the current device position
double devPosition = spiceData->devLastPosition +
(spiceData->devNextPosition - spiceData->devLastPosition) *
((double) (curTime - spiceData->devLastTime) /
(spiceData->devNextTime - spiceData->devLastTime));
// Target latency derived experimentally to avoid underruns. This could be
// reduced with more tuning. We could adjust on the fly based upon the
// device period size, but that would result in underruns if the period size
// suddenly increases. It may be better instead to just reduce the maximum
// latency on the audio devices, which currently is set quite high
int targetLatencyMs = 70;
int targetLatencyFrames =
targetLatencyMs * audio.playback.sampleRate / 1000;
double actualOffset = curPosition - devPosition;
double actualOffsetError = -(actualOffset - targetLatencyFrames);
double error = actualOffsetError - offsetError;
spiceData->offsetError += spiceData->b * error +
spiceData->offsetErrorIntegral;
spiceData->offsetErrorIntegral += spiceData->c * error;
}
// Resample the audio to adjust the playback speed. Use a PI controller to
// adjust the resampling ratio based upon the measured offset
double kp = 0.5e-6;
double ki = 1.0e-16;
spiceData->ratioIntegral += offsetError * spiceData->periodSec;
double piOutput = kp * offsetError + ki * spiceData->ratioIntegral;
double ratio = 1.0 + piOutput;
int consumed = 0;
while (consumed < frames)
{
SRC_DATA srcData = {
.data_in = spiceData->framesIn + consumed * audio.playback.channels,
.data_out = spiceData->framesOut,
.input_frames = frames - consumed,
.output_frames = spiceData->framesOutSize,
.input_frames_used = 0,
.output_frames_gen = 0,
.end_of_input = 0,
.src_ratio = ratio
};
int error = src_process(spiceData->src, &srcData);
if (error)
{
DEBUG_ERROR("Resampling failed: %s", src_strerror(error));
return;
}
ringbuffer_append(audio.playback.buffer, spiceData->framesOut,
srcData.output_frames_gen);
consumed += srcData.input_frames_used;
spiceData->nextPosition += srcData.output_frames_gen;
}
if (audio.playback.state == STREAM_STATE_SETUP)
{