Skip to content

Instantly share code, notes, and snippets.

@neilslater
Last active December 16, 2015 18:09
Show Gist options
  • Save neilslater/5476045 to your computer and use it in GitHub Desktop.
Save neilslater/5476045 to your computer and use it in GitHub Desktop.
Rough and ready spectrogram code
require 'narray'
require 'fftw3'
require 'raiff'
require 'rmagick'
# This class models audio power spectrum generation from audio sources (as .aiff files)
# An instance of this class represents a single audio clip and its spectrogram
class AudioSpectrum
# TODO: Over time, we want to improve on these crude hard-coded values, and either take them
# as params (e.g. number of windows, min/max frequency to store), or read from the audio file
# In addition, we probably need to work with files large enough that slurping them in one
# go is too much. There should be a slurp max size.
SAMPLE_TIME = 0.0005
SAMPLE_FREQ = 2000.0
SAMPLE_MAX = 128 * 256
SAMPLE_SCALE = 1.0 / SAMPLE_MAX
NSAMPLES = 4000
# This window and "height" takes us to half Nyquist limit, or 500Hz, for the spectrogram
DEFAULT_FFT_WINDOW_SIZE = 256
DEFAULT_WINDOW_STEP_SIZE = 32
DEFAULT_NUM_WINDOWS = 64
DEFAULT_LOWEST_FREQ = 0
DEFAULT_HIGHEST_FREQ = 64
PI = Math.acos(0.0) * 2
@@hann_window = nil
@@bh_window = nil
@@diamond = nil
# file must be a mono, 2kHz, 2s audio clip in .aiff format. More formats later (maybe)
def initialize file = nil
# @offsets, @audio and @spectrogram all assume that the entire audio is processed at once,
# we probably want something longer-term that handles just parts of audio streams. Probably a fixed
# offset in samples or time . . .
@fft_window_size = DEFAULT_FFT_WINDOW_SIZE
@sample_start = 0
@sample_jitter = 1.0
@window_step_size = DEFAULT_WINDOW_STEP_SIZE
@window_count = DEFAULT_NUM_WINDOWS
@lowest_freq = DEFAULT_LOWEST_FREQ
@highest_freq = DEFAULT_HIGHEST_FREQ
@window_type = :hann
@audio = NArray.float( NSAMPLES )
if file
raise "File not found #{file}" unless File.exists?(file)
load_audio file
end
@normalise_type = :whole
end
attr_accessor :fft_window_size
# the sample at which to start the first FFT window, defaults to 0
attr_accessor :sample_start
# size of random offset to each sample, expressed in multiples of smallest same size, default 1.0
# a value of 1.0 should reduce systemic noise caused by limited bit depth (at the expense of introducing white noise)
attr_accessor :sample_jitter
# number of samples offset for each window with frequency breakdown, default 32. Floats are allowed
attr_accessor :window_step_size
# number of samples offset for each window with frequency breakdown, default 64
attr_accessor :window_count
# lowest frequency bin to use for the spectrogram, default 0
attr_accessor :lowest_freq
# highest frequency bin to use for the spectrogram, default 64
attr_accessor :highest_freq
# type of normalisation to use for the spectrogram, default :whole. Can also be :none, :freq or :time
attr_accessor :normalise_type
# type of windowing function, choice of :hann or :blackman_harris
attr_accessor :window_type
# number of frequencies, default 65
attr_reader :spectrogram_height
# an NArray of 4000 audio samples, range from to -1.0 to 1.0
attr_reader :audio
# Integer number of audio samples used to create each timeslice of data. Frequency resolution is better with larger windows,
# but time resolution is worse.
attr_reader :fft_window_size
# a 2-dimensional NArray of window_count x spectrogram_height frequencies, giving normalised relative intensity at each frequency.
def spectrogram
@spectrogram_height = 1 + @highest_freq - @lowest_freq
@offsets = calc_offsets @sample_start, @window_step_size, @window_count
@spectrogram = NArray.float( @window_count, @spectrogram_height )
normalised_spectrogram
@spectrogram
end
# writes spectrogram to image file, time series left to right, highest frequency at top, one pixel
# per "bin"
def write_spectrogram_graphic filename
spectrogram unless @spectrogram
width = @offsets.size
height = @spectrogram_height
img = Magick::Image.new(width, height)
(0...width).each do |x|
(0...height).each do |y|
pixel_x = x
pixel_y = @spectrogram_height - 1 - y
pixel_colour = intensity_color( @spectrogram[x,y] )
img.pixel_color( pixel_x, pixel_y, "rgb(#{pixel_colour.join(', ')})")
end
end
img.write(filename)
end
private
def intensity_color i
gamma = 1.0/1.3
l = 255.0 * ( i ** gamma )
[ Integer(l), Integer(l), Integer(i * 255) ]
end
def load_audio full_path
pos = 0
aiff = Raiff.open( full_path )
aiff.each_sample do |x|
@audio[pos] = SAMPLE_SCALE * x[0]
pos += 1
end
if @sample_jitter > 0.0
@audio += jitter_noise
end
@audio
end
def fft_window window_id
# Take a @fft_window_size-sized slice to analyse
win_start = @offsets[window_id]
win_end = win_start + @fft_window_size
audio_window = @audio[win_start...win_end]
# Apply a window function
if @window_type == :hann
ensure_hann_window
audio_window = audio_window * @@hann_window
elsif @window_type == :blackman_harris
ensure_blackman_harris_window
audio_window = audio_window * @@bh_window
end
freqs = FFTW3.fft(audio_window, -1)
base_factor = 2.0/@fft_window_size
@spectrogram[window_id,0...@spectrogram_height] = freqs[@lowest_freq...@lowest_freq+@spectrogram_height].abs * base_factor
max = @spectrogram[window_id,0...@spectrogram_height].max
if @normalise_type == :time
norm_factor = 1.0/(max + 1.0e-6)
@spectrogram[window_id,0...@spectrogram_height] *= norm_factor
end
max
end
def normalised_spectrogram
max = 0.0
# Collect data
(0...@offsets.size).each do |window_id|
v = fft_window( window_id )
max = v if v > max
end
# Normalise spectrogram, highest value will be ~0.99999
if @normalise_type == :whole
norm_factor = 1.0/(max + 1.0e-6)
@spectrogram *= norm_factor
elsif @normalise_type == :freq
(0...@spectrogram_height).each do |freq_id|
freq_max = @spectrogram[0...@offsets.size,freq_id].max
norm_factor = 1.0/(freq_max + 1.0e-6)
@spectrogram[0...@offsets.size,freq_id] *= norm_factor
end
elsif @normalise_type == :centre_rect
calc_weighted_mean_rect
weighted_mean = (@spectrogram * @@diamond).sum
norm_factor = 0.25/(weighted_mean + 1.0e-6)
@spectrogram *= norm_factor
# This clips the absoltue values
@spectrogram[@spectrogram.gt 1.0] = 1.0
end
@spectrogram
end
def freq_samples freq_id
@spectrogram[0...@offsets.size,freq_id]
end
def calc_offsets start, step, count
(0...count).map do |x|
(start + x * step).round
end
end
def ensure_hann_window
return if @@hann_window && @@hann_window.size == @fft_window_size
@@hann_window = NArray.float( @fft_window_size )
pi_invw = ( 2.0 * PI )/( @fft_window_size.to_f - 1.0 )
(0...@fft_window_size).each do |i|
@@hann_window[i] = 0.5 * (1.0 - Math.cos( i.to_f * pi_invw ))
end
end
def ensure_blackman_harris_window
return if @@bh_window && @@bh_window.size == @fft_window_size
@@bh_window = NArray.float( @fft_window_size )
# pi_invw -> 2pi/(N-1)
pi_invw = ( 2.0 * PI )/( @fft_window_size.to_f - 1.0 )
a0 = 0.35875
a1 = 0.48829
a2 = 0.14128
a3 = 0.01168
(0...@fft_window_size).each do |i|
@@bh_window[i] = a0 - a1 * Math.cos( i.to_f * pi_invw ) + a2 * Math.cos( (2 * i).to_f * pi_invw ) - a3 * Math.cos( (3 * i).to_f * pi_invw )
end
end
# Creates a square shape of floats where the sum is 1.0, which can then be used to get a weighted central mean of a spectrogram
def calc_weighted_mean_rect
return @@diamond if @@diamond
@@diamond = NArray.float( @window_count, @spectrogram_height )
# This is placeholder code for the simplest central rect, covering very roughly the central 1/3 (by area) of the spectrogram
lowest_window = Integer(@window_count * 0.21)
highest_window = Integer(@window_count * 0.79) + 1
lowest_freq = Integer(@spectrogram_height * 0.21)
highest_freq = Integer(@spectrogram_height * 0.79) + 1
npixels = (highest_window - lowest_window) * (highest_freq - lowest_freq)
@@diamond[lowest_window..highest_window,lowest_freq..highest_freq] = 1.0/npixels
end
def jitter_noise
jitter = ( NArray.float( NSAMPLES ).random() - 0.5 ) * ( SAMPLE_SCALE * @sample_jitter )
jitter
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment