Ever wanted to build a performant audio capture system but didn't know where to start? It definitely happened to me.
Due to the lack of modern C++ implementations for this, I decided to share what I learned while building it and give you a starting point based on what I've managed to build!
DISCLAIMER: Knowledge of OOP, basic C++ principles, general programming, and gRPC is advised!
The motivation
This is part of a much larger project I'm working on. All of the source code is linked at the end of the post! 😊
The CMake configuration
Every modern C/C++ application requires some kind of build configuration. I chose CMake, since it's perfect for cross-platform compilation!
```cmake
# Define the minimum required version
# and the project version
cmake_minimum_required(VERSION 3.16)
project(audio_capture VERSION 1.0)

# Set C++ standards
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

# Include additional common cmake code for working with gRPC
include(./cmake/common.cmake)

# Include FetchContent for fetching code and files remotely
include(FetchContent)

# Find all .proto files in the protos directory
file(GLOB_RECURSE PROTO_FILES "${CMAKE_SOURCE_DIR}/protos/*.proto")

# Create lists to store generated files
set(PROTO_SRCS)
set(PROTO_HDRS)
set(GRPC_SRCS)
set(GRPC_HDRS)

# Generate protocol buffer and gRPC code for each .proto file
foreach(proto_file ${PROTO_FILES})
  get_filename_component(proto_path "${proto_file}" PATH)
  get_filename_component(proto_name "${proto_file}" NAME_WE)

  list(APPEND PROTO_SRCS "${CMAKE_CURRENT_BINARY_DIR}/${proto_name}.pb.cc")
  list(APPEND PROTO_HDRS "${CMAKE_CURRENT_BINARY_DIR}/${proto_name}.pb.h")
  list(APPEND GRPC_SRCS "${CMAKE_CURRENT_BINARY_DIR}/${proto_name}.grpc.pb.cc")
  list(APPEND GRPC_HDRS "${CMAKE_CURRENT_BINARY_DIR}/${proto_name}.grpc.pb.h")

  add_custom_command(
    OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${proto_name}.pb.cc"
           "${CMAKE_CURRENT_BINARY_DIR}/${proto_name}.pb.h"
           "${CMAKE_CURRENT_BINARY_DIR}/${proto_name}.grpc.pb.cc"
           "${CMAKE_CURRENT_BINARY_DIR}/${proto_name}.grpc.pb.h"
    COMMAND ${_PROTOBUF_PROTOC}
    ARGS --grpc_out=generate_mock_code=false:"${CMAKE_CURRENT_BINARY_DIR}" # No mock code
         --cpp_out "${CMAKE_CURRENT_BINARY_DIR}"
         -I "${proto_path}"
         --plugin=protoc-gen-grpc="${_GRPC_CPP_PLUGIN_EXECUTABLE}"
         "${proto_file}"
    DEPENDS "${proto_file}")
endforeach()

# Create protocol buffer library
add_library(audio_proto ${GRPC_SRCS} ${GRPC_HDRS} ${PROTO_SRCS} ${PROTO_HDRS})
target_link_libraries(audio_proto absl::check ${_REFLECTION} ${_GRPC_GRPCPP}
                      ${_PROTOBUF_LIBPROTOBUF})

# Configure PortAudio
FetchContent_Declare(
  portaudio
  GIT_REPOSITORY https://github.com/PortAudio/portaudio.git
  GIT_TAG v19.7.0
)

# Configure PortAudio build options
set(PA_BUILD_SHARED OFF CACHE BOOL "")
set(PA_BUILD_TESTS OFF CACHE BOOL "")
set(PA_BUILD_EXAMPLES OFF CACHE BOOL "")

# Make PortAudio available
FetchContent_MakeAvailable(portaudio)

# Configure spdlog
FetchContent_Declare(
  spdlog
  GIT_REPOSITORY https://github.com/gabime/spdlog.git
  GIT_TAG v1.15.1
)
FetchContent_MakeAvailable(spdlog)

# Find FFTW3
find_package(PkgConfig REQUIRED)
pkg_check_modules(FFTW3 REQUIRED fftw3)

# Include directories
include_directories(
  ${CMAKE_CURRENT_BINARY_DIR}
  ${CMAKE_SOURCE_DIR}/includes
  ${FFTW3_INCLUDE_DIRS}
  ${portaudio_SOURCE_DIR}/source
)

# Create main executable
file(GLOB_RECURSE SRC_FILES src/*.cc)
add_executable(audio_capture ${SRC_FILES})

# Link dependencies
target_link_libraries(audio_capture PRIVATE
  audio_proto
  portaudio_static
  spdlog::spdlog
  ${FFTW3_LIBRARIES}
  absl::check
  absl::flags
  absl::flags_parse
  absl::log
  ${_REFLECTION}
  ${_GRPC_GRPCPP}
  ${_PROTOBUF_LIBPROTOBUF}
)

# Platform-specific configuration
if(WIN32)
  target_link_libraries(audio_capture PRIVATE winmm)
elseif(UNIX AND NOT APPLE)
  find_package(Threads REQUIRED)
  target_link_libraries(audio_capture PRIVATE
    pthread
    fftw3
  )
endif()

# Add FFTW3 library directories
link_directories(${FFTW3_LIBRARY_DIRS})
```
The project architecture
The whole project architecture is quite simple, maintainable, and made to scale.
```
|-- CMakeLists.txt
|-- Makefile
|-- cmake
|   `-- common.cmake
|-- includes
|   |-- audio_capture.h
|   `-- logger.h
|-- protos
|   `-- device_stream.proto
`-- src
    |-- audio_capture.cc
    |-- logger.cc
    `-- main.cc
```
The logger
I've used spdlog for the beautiful logging messages that come with it!
Here is the .h definition (all of the source code is on GitHub):
```cpp
#pragma once

#include <cstddef>
#include <memory>
#include <spdlog/sinks/basic_file_sink.h>
#include <spdlog/spdlog.h>
#include <string>

class Logger {
public:
  static void init();
  static std::shared_ptr<Logger> get();

  static constexpr size_t queue_items_max = 8192;
  static constexpr size_t backing_thread_count = 1;
  static constexpr size_t max_file_size = 10 * 1024 * 1024;
  static constexpr size_t max_files = 5;
  // Log file path (a plain const string; std::string can't be a constexpr member)
  inline static const std::string logs_path = "logs/app.log";

  static void info(const std::string &message) { logger_->info(message); }
  static void warn(const std::string &message) { logger_->warn(message); }
  static void error(const std::string &message) { logger_->error(message); }

private:
  static std::shared_ptr<spdlog::logger> logger_;
};
```
The logger implementation:
#include "logger.h" #include "spdlog/async.h" #include "spdlog/common.h" #include "spdlog/sinks/rotating_file_sink.h" #include "spdlog/sinks/stdout_color_sinks.h" #include "spdlog/spdlog.h" #include <iostream> #include <memory> // Initialize logger as nullptr std::shared_ptr<spdlog::logger> Logger::logger_ = nullptr; void Logger::init() { try { spdlog::init_thread_pool(queue_items_max, backing_thread_count); auto console_sink = std::make_shared<spdlog::sinks::stdout_color_sink_mt>(); auto file_rotating_sink = std::make_shared<spdlog::sinks::rotating_file_sink_mt>( logs_path, max_file_size, max_files); console_sink->set_level(spdlog::level::info); file_rotating_sink->set_level(spdlog::level::debug); logger_ = std::make_shared<spdlog::async_logger>( "main_logger", spdlog::sinks_init_list{console_sink, file_rotating_sink}, spdlog::thread_pool(), spdlog::async_overflow_policy::block); spdlog::set_default_logger(logger_); spdlog::set_pattern("[%Y-%m-%d %H:%M:%S] [%^%l%$] [thread %t] %v"); #ifdef NDEBUG spdlog::set_level(spdlog::level::info); #else spdlog::set_level(spdlog::level::debug); #endif spdlog::info("Logger initialized successfully!"); } catch (const spdlog::spdlog_ex &ex) { std::cerr << "Log initialization failed: " << ex.what() << std::endl; exit(EXIT_FAILURE); } } std::shared_ptr<Logger> Logger::get() { static std::shared_ptr<Logger> instance = std::make_shared<Logger>(); return instance; }
The protobuf GRPC messages & RPCs
The current implementation is simple and straightforward: the server and the client share the same .proto file and use a client-side stream to communicate.
syntax = "proto3"; package audio_stream; service AudioStream { rpc StreamAudio(stream AudioChunk) returns (StreamResponse); } message AudioChunk { bytes audio_data = 1; repeated double spectral_data = 2; double energy = 3; double zero_crossings = 4; double speech_band_energy = 5; bool voice_detected = 6; int64 timestamp = 7; } message StreamResponse { bool success = 1; string response = 2; }
The audio capture system
The audio capture system is implemented with PortAudio, a library for cross-platform audio I/O.
Here is my implementation of the .h file (header):
#pragma once #include "portaudio.h" #include <atomic> #include <condition_variable> #include <cstddef> #include <memory> #include <mutex> #include <queue> #include <unordered_map> #include <vector> #include "device_stream.grpc.pb.h" // gRPC generated header // Forward declaration of Logger class Logger; // AudioCapture class for handling audio streaming class AudioCapture { public: AudioCapture(); ~AudioCapture(); static constexpr size_t sample_rate = 44100; // Audio sample rate static constexpr size_t frames_per_buffer = 512; // Frames per buffer static constexpr int numInputChannels = 1; // Number of input channels (mono) static constexpr int numOutputChannels = 0; // Number of output channels (no output) // Starts the audio stream bool start(); // Stops the audio stream void stop(); // Checks if the stream is running bool isRunning() const; // Retrieves available audio devices std::unordered_map<size_t, std::shared_ptr<const PaDeviceInfo>> getDevices(); // Opens the audio stream with the specified device index void openAudioStream(int deviceIndex); // Gets the total number of audio devices int getDeviceCount() const; // Creates stream parameters for a given device and number of channels std::unique_ptr<PaStreamParameters> createStreamParameters(int device, int numChannels); // Sends the audio stream to the gRPC server void sendAudioStream(std::shared_ptr<audio_stream::AudioStream::Stub> stub); // Closes the audio stream void closeAudioStream(); // Retrieves the next audio chunk from the queue std::vector<float> getNextAudioChunk(); private: // Audio callback function for PortAudio static int audioCallback(const void *input, void *, size_t, const PaStreamCallbackTimeInfo *, PaStreamCallbackFlags, void *userData); // Queue to hold audio chunks std::queue<std::vector<float>> audioQueue; // Mutex and condition variable for thread synchronization std::mutex queueMutex; std::condition_variable queueCondition; // Atomic flag to manage the running state of the capture std::atomic<bool> running_; // Stream parameters for input and output std::unique_ptr<PaStreamParameters> inputParameters; std::unique_ptr<PaStreamParameters> outputParameters; // Logger for logging events std::shared_ptr<Logger> logger_; // Pointer to the PortAudio stream PaStream *stream_; };
The implementation of audio capture:
```cpp
// audio_capture.cc
#include "audio_capture.h"

#include "device_stream.pb.h"
#include "logger.h"
#include "portaudio.h"

#include <algorithm>
#include <chrono>
#include <cstddef>
#include <grpcpp/client_context.h>
#include <grpcpp/support/sync_stream.h>
#include <mutex>
#include <optional>
#include <string>
#include <thread>
#include <utility>
#include <vector>

static void handleAudioError(PaError err, std::shared_ptr<Logger> logger) {
  if (err != paNoError) {
    logger->error("PortAudio error: " + std::string(Pa_GetErrorText(err)));
  }
}

AudioCapture::AudioCapture()
    : stream_(nullptr), running_(false), logger_(std::make_shared<Logger>()) {
  PaError err = Pa_Initialize();
  handleAudioError(err, logger_);
}

AudioCapture::~AudioCapture() {
  PaError err = Pa_Terminate();
  handleAudioError(err, logger_);
}

bool AudioCapture::start() {
  if (running_)
    return false;

  handleAudioError(Pa_OpenDefaultStream(&stream_, numInputChannels,
                                        numOutputChannels, paFloat32,
                                        sample_rate, frames_per_buffer,
                                        audioCallback, this),
                   logger_);

  PaError err = Pa_StartStream(stream_);
  if (err != paNoError) {
    logger_->error("Failed to start audio stream: " +
                   std::string(Pa_GetErrorText(err)));
    return false;
  }

  running_ = true;
  return true;
}

void AudioCapture::stop() {
  if (!running_) {
    return;
  }
  handleAudioError(Pa_StopStream(stream_), logger_);
  Pa_CloseStream(stream_);
  running_ = false;
}

bool AudioCapture::isRunning() const { return running_; }

int AudioCapture::getDeviceCount() const {
  PaDeviceIndex deviceCount = Pa_GetDeviceCount();
  return static_cast<int>(deviceCount);
}

std::vector<float> AudioCapture::getNextAudioChunk() {
  std::unique_lock<std::mutex> lock(queueMutex);
  queueCondition.wait(lock,
                      [this] { return !audioQueue.empty() || !running_; });

  if (!audioQueue.empty()) {
    std::vector<float> chunk = std::move(audioQueue.front());
    audioQueue.pop();
    return chunk;
  }

  return std::vector<float>();
}

void AudioCapture::closeAudioStream() {
  if (Pa_IsStreamActive(stream_) == 1) {
    Pa_StopStream(stream_);
  }
  Pa_CloseStream(stream_);
  stream_ = nullptr;
}

void AudioCapture::openAudioStream(int deviceIndex) {
  if (stream_) {
    if (Pa_IsStreamActive(stream_) == 1) {
      handleAudioError(Pa_StopStream(stream_), logger_);
    }
    Pa_CloseStream(stream_);
    stream_ = nullptr;
  }

  // Reset parameters
  inputParameters.reset();
  outputParameters.reset();

  if (deviceIndex < 0 || deviceIndex >= getDeviceCount()) {
    logger_->error("Invalid device index");
    return;
  }

  const PaDeviceInfo *deviceInfo = Pa_GetDeviceInfo(deviceIndex);
  if (!deviceInfo) {
    logger_->error("Failed to get device info");
    return;
  }

  logger_->info("Device Info: ");
  logger_->info("Name: " + std::string(deviceInfo->name));
  logger_->info("Max Input Channels: " +
                std::to_string(deviceInfo->maxInputChannels));
  logger_->info("Max Output Channels: " +
                std::to_string(deviceInfo->maxOutputChannels));
  logger_->info("Default Sample Rate: " +
                std::to_string(deviceInfo->defaultSampleRate));

  std::optional<PaStreamParameters> inputParams;
  std::optional<PaStreamParameters> outputParams;

  if (deviceInfo->maxInputChannels > 0) {
    inputParameters =
        createStreamParameters(deviceIndex, deviceInfo->maxInputChannels);
    inputParams = *inputParameters;
  }

  if (deviceInfo->maxOutputChannels > 0) {
    outputParameters =
        createStreamParameters(deviceIndex, deviceInfo->maxOutputChannels);
    outputParams = *outputParameters;
  }

  double deviceSampleRate = deviceInfo->defaultSampleRate;

  handleAudioError(Pa_OpenStream(&stream_,
                                 inputParams ? &*inputParams : nullptr,
                                 outputParams ? &*outputParams : nullptr,
                                 deviceSampleRate, frames_per_buffer, paClipOff,
                                 audioCallback, this),
                   logger_);

  std::string deviceName = std::string(deviceInfo->name);
  logger_->info("Stream successfully opened for device " + deviceName);

  Pa_StartStream(stream_);
}

std::unique_ptr<PaStreamParameters>
AudioCapture::createStreamParameters(int device, int numChannels) {
  auto parameters = std::make_unique<PaStreamParameters>();
  const PaDeviceInfo *deviceInfo = Pa_GetDeviceInfo(device);

  if (!deviceInfo) {
    logger_->error("Failed to get device information for device: " +
                   std::to_string(device));
    return nullptr;
  }

  int channels = std::min(numChannels, deviceInfo->maxInputChannels);

  parameters->device = device;
  parameters->channelCount = channels;
  parameters->sampleFormat = paFloat32;
  parameters->suggestedLatency = deviceInfo->defaultHighInputLatency;
  parameters->hostApiSpecificStreamInfo = nullptr;

  return parameters;
}

std::unordered_map<size_t, std::shared_ptr<const PaDeviceInfo>>
AudioCapture::getDevices() {
  std::unordered_map<size_t, std::shared_ptr<const PaDeviceInfo>> devices;
  PaDeviceIndex deviceCount = Pa_GetDeviceCount();

  if (deviceCount < 0) {
    logger_->error("Failed to retrieve PortAudio device count: " +
                   std::to_string(deviceCount));
    return devices;
  }

  for (size_t i = 0; i < static_cast<size_t>(deviceCount); i++) {
    const PaDeviceInfo *deviceInfo = Pa_GetDeviceInfo(i);
    if (deviceInfo) {
      devices.emplace(i, std::make_shared<const PaDeviceInfo>(*deviceInfo));
    } else {
      logger_->warn("Warning: Failed to get device info for device " +
                    std::to_string(i));
    }
  }

  return devices;
}

int AudioCapture::audioCallback(const void *input, void *,
                                unsigned long /*frameCount*/,
                                const PaStreamCallbackTimeInfo *,
                                PaStreamCallbackFlags, void *userData) {
  if (!input) {
    return paContinue;
  }

  auto *self = static_cast<AudioCapture *>(userData);

  std::vector<float> buffer(
      static_cast<const float *>(input),
      static_cast<const float *>(input) + frames_per_buffer);

  {
    std::lock_guard<std::mutex> lock(self->queueMutex);
    self->audioQueue.push(buffer);
  }
  self->queueCondition.notify_one();

  return paContinue;
}

void AudioCapture::sendAudioStream(
    std::shared_ptr<audio_stream::AudioStream::Stub> stub) {
  grpc::ClientContext context;
  audio_stream::StreamResponse response; // Define the response here

  // Pass the response pointer as the second argument to StreamAudio
  std::unique_ptr<grpc::ClientWriter<audio_stream::AudioChunk>> writer(
      stub->StreamAudio(&context, &response));

  if (!writer) {
    logger_->error("Failed to create gRPC writer.");
    return;
  }

  // Sending audio chunks in a stream
  while (isRunning()) {
    std::vector<float> chunk = getNextAudioChunk();

    if (!chunk.empty()) {
      audio_stream::AudioChunk audioChunk;

      // Convert float vector to byte array (audio data)
      std::string audioData(reinterpret_cast<const char *>(chunk.data()),
                            chunk.size() * sizeof(float));
      audioChunk.set_audio_data(audioData);

      // Fill in other fields like spectral data, energy, etc.
      audioChunk.add_spectral_data(0.0);      // Example spectral data
      audioChunk.set_energy(0.0);             // Example energy
      audioChunk.set_zero_crossings(0.0);     // Example zero crossings
      audioChunk.set_speech_band_energy(0.0); // Example speech band energy
      audioChunk.set_voice_detected(false);   // Example voice detection
      audioChunk.set_timestamp(0);            // Example timestamp

      if (!writer->Write(audioChunk)) {
        logger_->error("Failed to send audio chunk.");
        break;
      }
    }

    // Optional: sleep to avoid overloading the server
    std::this_thread::sleep_for(std::chrono::milliseconds(10)); // Adjust as necessary
  }

  writer->WritesDone();

  // Finish the RPC; the server's StreamResponse ends up in `response`
  grpc::Status status = writer->Finish();

  if (!status.ok()) {
    logger_->error("gRPC failed: " + status.error_message());
  } else {
    // Optionally handle the StreamResponse from the server
    logger_->info("Audio stream successfully sent and received response.");
    // If you want to log response details, you can use `response` here.
  }
}
```
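Note that the feature fields on AudioChunk (spectral_data, energy, zero_crossings, speech_band_energy) are only filled with placeholder values above. As a rough illustration of what could go there, here's a sketch that computes frame energy and a zero-crossing count straight from the float samples; the computeBasicFeatures helper and the way it's wired in are my own additions, not part of the repository (spectral features would come from FFTW, which the CMake file already links against):

```cpp
#include <cstddef>
#include <vector>

// Hypothetical helper: not part of the original AudioCapture class.
struct BasicFeatures {
  double energy = 0.0;         // mean squared amplitude of the frame
  double zero_crossings = 0.0; // number of sign changes in the frame
};

static BasicFeatures computeBasicFeatures(const std::vector<float> &samples) {
  BasicFeatures f;
  if (samples.empty())
    return f;

  for (size_t i = 0; i < samples.size(); ++i) {
    f.energy += static_cast<double>(samples[i]) * samples[i];
    if (i > 0 && (samples[i] >= 0.0f) != (samples[i - 1] >= 0.0f))
      f.zero_crossings += 1.0;
  }
  f.energy /= static_cast<double>(samples.size());
  return f;
}

// Inside sendAudioStream, the placeholders could then become:
//   BasicFeatures f = computeBasicFeatures(chunk);
//   audioChunk.set_energy(f.energy);
//   audioChunk.set_zero_crossings(f.zero_crossings);
```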
The main
The main executable then connects to a gRPC server written in Go. The server is out of scope for this blog post, but its source code is included in the GitHub repository.
#include "audio_capture.h" #include "device_stream.grpc.pb.h" #include "logger.h" #include <cstdlib> #include <grpcpp/grpcpp.h> int main() { Logger::init(); auto logger = Logger::get(); AudioCapture audioCapture; // Only one instance is needed auto devices = audioCapture.getDevices(); if (devices.empty()) { logger->error("No audio devices found."); return EXIT_FAILURE; } logger->info("Available Audio Devices"); for (const auto &[index, deviceInfo] : devices) { std::string deviceName = "Device " + std::to_string(index) + ": " + deviceInfo->name + "\n"; logger->info(deviceName); logger->warn("Opening audio stream on device " + std::to_string(index)); audioCapture.openAudioStream(index); } // Create a gRPC channel to the server std::shared_ptr<grpc::Channel> channel = grpc::CreateChannel( "localhost:50051", grpc::InsecureChannelCredentials()); // Create the gRPC client stub std::shared_ptr<audio_stream::AudioStream::Stub> stub = audio_stream::AudioStream::NewStub(channel); // Start capturing audio audioCapture.start(); // Send audio stream to the external gRPC server audioCapture.sendAudioStream(stub); return 0; }
Running and building the code
For building and running the code, I use a simple Makefile with a few commands:
```makefile
# Compiler optimizations
CMAKE_FLAGS=-G Ninja -DCMAKE_BUILD_TYPE=Release
BUILD_DIR=build
NUM_CORES=$(shell nproc)

.PHONY: build run run_detailed rerun remove rebuild

build:
	@mkdir -p $(BUILD_DIR)
	@cd $(BUILD_DIR) && cmake $(CMAKE_FLAGS) .. && cmake --build . -j$(NUM_CORES)

run:
	@cd $(BUILD_DIR) && ./audio_capture 2>/dev/null

run_detailed:
	@cd $(BUILD_DIR) && ./audio_capture

rerun: rebuild run

remove:
	@rm -rf $(BUILD_DIR)

rebuild:
	@cd $(BUILD_DIR) && cmake --build . --clean-first -j$(NUM_CORES) || { rm -rf $(BUILD_DIR) && make build; }
```
Running the code should be as easy as make $COMMAND
Thank you for reading this far ⭐
I want to thank you, the reader, for making it this far! Here is the link to the repository with all of the source code 😊!