Today’s Python experiment came after hearing an explosion outside. We never identified what it was, but it made me curious whether I could build an AI listener for the camera. Long story short: Python with NumPy, pandas, and TensorFlow (plus ffmpeg to pull the stream) is all you need to read the audio from an RTSP feed and listen for audio events.

import subprocess
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
import os

# Load the YAMNet model from TensorFlow Hub
yamnet_model = hub.load('https://tfhub.dev/google/yamnet/1')
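# (hub.load downloads the model on first run and caches it locally for later runs)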

# Load the class map from the local file
local_class_map_path = os.path.join(os.path.dirname(__file__), 'yamnet_class_map.csv')

# Check if the file exists
if not os.path.exists(local_class_map_path):
    raise FileNotFoundError(f"Class map file not found at {local_class_map_path}")

# Load class names into a list
class_names = pd.read_csv(local_class_map_path)['display_name'].to_list()
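
# Alternative, if you don't want to keep a local copy of the CSV: the TF Hub
# model ships with its own class map, so (untested here) this should also work:
#   class_map_path = yamnet_model.class_map_path().numpy()
#   class_names = pd.read_csv(class_map_path)['display_name'].to_list()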

def start_ffmpeg(rtsp_url):
    command = [
        'ffmpeg',
        '-i', rtsp_url,
        '-vn',  # Skip video processing
        '-acodec', 'pcm_s16le',  # Use raw 16-bit little-endian PCM audio
        '-ar', '16000',  # Set sample rate to 16 kHz
        '-ac', '1',  # Set number of audio channels to 1 (mono)
        '-f', 's16le',  # Set output format to raw PCM data
        'pipe:1'  # Output to stdout
    ]
    return subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, bufsize=10**8)
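
# The same ffmpeg invocation can be sanity-checked from a shell before wiring
# it into Python, writing a few seconds of raw PCM to a file instead of stdout
# (substitute your own camera's RTSP URL, as used in main() below):
#   ffmpeg -i "rtsp://user:pass@camera/..." -vn -acodec pcm_s16le -ar 16000 -ac 1 -f s16le test.raw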

def process_audio(ffmpeg_process):
    chunk_size = 16000 * 1  # Number of samples for 1 second of audio
    while True:
        data = ffmpeg_process.stdout.read(chunk_size * 2)  # 2 bytes per sample (16-bit audio)
        if len(data) < chunk_size * 2:
            break  # Stream ended (or final partial chunk); stop processing
        # Convert raw audio data to numpy array
        audio_samples = np.frombuffer(data, dtype=np.int16).astype(np.float32) / 32768.0  # Normalize to [-1.0, 1.0]

        # Run the model
        scores, embeddings, spectrogram = yamnet_model(audio_samples)
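        # 'scores' is a 2-D tensor of shape (num_frames, 521): one row of
        # class scores per model frame, one column per AudioSet class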
        scores = scores.numpy()

        # Aggregate scores across time frames
        mean_scores = np.mean(scores, axis=0)

        # Get the top N predictions
        top_N = 5
        top_class_indices = np.argsort(mean_scores)[::-1][:top_N]

        print("Detected sounds:")
        for i in top_class_indices:
            print(f"{class_names[i]}: {mean_scores[i]:.3f}")
        print("-" * 40)

def main():
    rtsp_url = 'rtsp://user:pass@192.168.1.2:554/cam/realmonitor?channel=1&subtype=0'

    ffmpeg_process = start_ffmpeg(rtsp_url)

    try:
        process_audio(ffmpeg_process)
    except KeyboardInterrupt:
        print("Stopping...")
    finally:
        ffmpeg_process.terminate()

if __name__ == '__main__':
    main()
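
From here it’s a small step from printing the top five classes to alerting on the ones that matter. Below is a minimal sketch of that idea, assuming the display names ('Explosion', 'Fireworks', 'Gunshot, gunfire') match the entries in yamnet_class_map.csv and that 0.3 is a reasonable starting threshold; it’s meant to be called from process_audio() with the frame-averaged scores.

TARGET_CLASSES = {'Explosion', 'Fireworks', 'Gunshot, gunfire'}
SCORE_THRESHOLD = 0.3  # starting point only; tune against real audio from the camera

def check_for_events(mean_scores):
    # mean_scores: per-class scores averaged over frames, as computed in process_audio()
    for i, score in enumerate(mean_scores):
        if class_names[i] in TARGET_CLASSES and score >= SCORE_THRESHOLD:
            print(f"ALERT: {class_names[i]} (score {score:.3f})")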