Last active
December 16, 2015 18:09
-
-
Save neilslater/5476045 to your computer and use it in GitHub Desktop.
Rough and ready spectrogram code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'narray' | |
require 'fftw3' | |
require 'raiff' | |
require 'rmagick' | |
# This class models audio power spectrum generation from audio sources (as .aiff files) | |
# An instance of this class represents a single audio clip and its spectrogram | |
class AudioSpectrum | |
# TODO: Over time, we want to improve on these crude hard-coded values, and either take them | |
# as params (e.g. number of windows, min/max frequency to store), or read from the audio file | |
# In addition, we probably need to work with files large enough that slurping them in one | |
# go is too much. There should be a slurp max size. | |
SAMPLE_TIME = 0.0005 | |
SAMPLE_FREQ = 2000.0 | |
SAMPLE_MAX = 128 * 256 | |
SAMPLE_SCALE = 1.0 / SAMPLE_MAX | |
NSAMPLES = 4000 | |
# This window and "height" takes us to half Nyquist limit, or 500Hz, for the spectrogram | |
DEFAULT_FFT_WINDOW_SIZE = 256 | |
DEFAULT_WINDOW_STEP_SIZE = 32 | |
DEFAULT_NUM_WINDOWS = 64 | |
DEFAULT_LOWEST_FREQ = 0 | |
DEFAULT_HIGHEST_FREQ = 64 | |
PI = Math.acos(0.0) * 2 | |
@@hann_window = nil | |
@@bh_window = nil | |
@@diamond = nil | |
# file must be a mono, 2kHz, 2s audio clip in .aiff format. More formats later (maybe) | |
def initialize file = nil | |
# @offsets, @audio and @spectrogram all assume that the entire audio is processed at once, | |
# we probably want something longer-term that handles just parts of audio streams. Probably a fixed | |
# offset in samples or time . . . | |
@fft_window_size = DEFAULT_FFT_WINDOW_SIZE | |
@sample_start = 0 | |
@sample_jitter = 1.0 | |
@window_step_size = DEFAULT_WINDOW_STEP_SIZE | |
@window_count = DEFAULT_NUM_WINDOWS | |
@lowest_freq = DEFAULT_LOWEST_FREQ | |
@highest_freq = DEFAULT_HIGHEST_FREQ | |
@window_type = :hann | |
@audio = NArray.float( NSAMPLES ) | |
if file | |
raise "File not found #{file}" unless File.exists?(file) | |
load_audio file | |
end | |
@normalise_type = :whole | |
end | |
attr_accessor :fft_window_size | |
# the sample at which to start the first FFT window, defaults to 0 | |
attr_accessor :sample_start | |
# size of random offset to each sample, expressed in multiples of smallest same size, default 1.0 | |
# a value of 1.0 should reduce systemic noise caused by limited bit depth (at the expense of introducing white noise) | |
attr_accessor :sample_jitter | |
# number of samples offset for each window with frequency breakdown, default 32. Floats are allowed | |
attr_accessor :window_step_size | |
# number of samples offset for each window with frequency breakdown, default 64 | |
attr_accessor :window_count | |
# lowest frequency bin to use for the spectrogram, default 0 | |
attr_accessor :lowest_freq | |
# highest frequency bin to use for the spectrogram, default 64 | |
attr_accessor :highest_freq | |
# type of normalisation to use for the spectrogram, default :whole. Can also be :none, :freq or :time | |
attr_accessor :normalise_type | |
# type of windowing function, choice of :hann or :blackman_harris | |
attr_accessor :window_type | |
# number of frequencies, default 65 | |
attr_reader :spectrogram_height | |
# an NArray of 4000 audio samples, range from to -1.0 to 1.0 | |
attr_reader :audio | |
# Integer number of audio samples used to create each timeslice of data. Frequency resolution is better with larger windows, | |
# but time resolution is worse. | |
attr_reader :fft_window_size | |
# a 2-dimensional NArray of window_count x spectrogram_height frequencies, giving normalised relative intensity at each frequency. | |
def spectrogram | |
@spectrogram_height = 1 + @highest_freq - @lowest_freq | |
@offsets = calc_offsets @sample_start, @window_step_size, @window_count | |
@spectrogram = NArray.float( @window_count, @spectrogram_height ) | |
normalised_spectrogram | |
@spectrogram | |
end | |
# writes spectrogram to image file, time series left to right, highest frequency at top, one pixel | |
# per "bin" | |
def write_spectrogram_graphic filename | |
spectrogram unless @spectrogram | |
width = @offsets.size | |
height = @spectrogram_height | |
img = Magick::Image.new(width, height) | |
(0...width).each do |x| | |
(0...height).each do |y| | |
pixel_x = x | |
pixel_y = @spectrogram_height - 1 - y | |
pixel_colour = intensity_color( @spectrogram[x,y] ) | |
img.pixel_color( pixel_x, pixel_y, "rgb(#{pixel_colour.join(', ')})") | |
end | |
end | |
img.write(filename) | |
end | |
private | |
def intensity_color i | |
gamma = 1.0/1.3 | |
l = 255.0 * ( i ** gamma ) | |
[ Integer(l), Integer(l), Integer(i * 255) ] | |
end | |
def load_audio full_path | |
pos = 0 | |
aiff = Raiff.open( full_path ) | |
aiff.each_sample do |x| | |
@audio[pos] = SAMPLE_SCALE * x[0] | |
pos += 1 | |
end | |
if @sample_jitter > 0.0 | |
@audio += jitter_noise | |
end | |
@audio | |
end | |
def fft_window window_id | |
# Take a @fft_window_size-sized slice to analyse | |
win_start = @offsets[window_id] | |
win_end = win_start + @fft_window_size | |
audio_window = @audio[win_start...win_end] | |
# Apply a window function | |
if @window_type == :hann | |
ensure_hann_window | |
audio_window = audio_window * @@hann_window | |
elsif @window_type == :blackman_harris | |
ensure_blackman_harris_window | |
audio_window = audio_window * @@bh_window | |
end | |
freqs = FFTW3.fft(audio_window, -1) | |
base_factor = 2.0/@fft_window_size | |
@spectrogram[window_id,0...@spectrogram_height] = freqs[@lowest_freq...@lowest_freq+@spectrogram_height].abs * base_factor | |
max = @spectrogram[window_id,0...@spectrogram_height].max | |
if @normalise_type == :time | |
norm_factor = 1.0/(max + 1.0e-6) | |
@spectrogram[window_id,0...@spectrogram_height] *= norm_factor | |
end | |
max | |
end | |
def normalised_spectrogram | |
max = 0.0 | |
# Collect data | |
(0...@offsets.size).each do |window_id| | |
v = fft_window( window_id ) | |
max = v if v > max | |
end | |
# Normalise spectrogram, highest value will be ~0.99999 | |
if @normalise_type == :whole | |
norm_factor = 1.0/(max + 1.0e-6) | |
@spectrogram *= norm_factor | |
elsif @normalise_type == :freq | |
(0...@spectrogram_height).each do |freq_id| | |
freq_max = @spectrogram[0...@offsets.size,freq_id].max | |
norm_factor = 1.0/(freq_max + 1.0e-6) | |
@spectrogram[0...@offsets.size,freq_id] *= norm_factor | |
end | |
elsif @normalise_type == :centre_rect | |
calc_weighted_mean_rect | |
weighted_mean = (@spectrogram * @@diamond).sum | |
norm_factor = 0.25/(weighted_mean + 1.0e-6) | |
@spectrogram *= norm_factor | |
# This clips the absoltue values | |
@spectrogram[@spectrogram.gt 1.0] = 1.0 | |
end | |
@spectrogram | |
end | |
def freq_samples freq_id | |
@spectrogram[0...@offsets.size,freq_id] | |
end | |
def calc_offsets start, step, count | |
(0...count).map do |x| | |
(start + x * step).round | |
end | |
end | |
def ensure_hann_window | |
return if @@hann_window && @@hann_window.size == @fft_window_size | |
@@hann_window = NArray.float( @fft_window_size ) | |
pi_invw = ( 2.0 * PI )/( @fft_window_size.to_f - 1.0 ) | |
(0...@fft_window_size).each do |i| | |
@@hann_window[i] = 0.5 * (1.0 - Math.cos( i.to_f * pi_invw )) | |
end | |
end | |
def ensure_blackman_harris_window | |
return if @@bh_window && @@bh_window.size == @fft_window_size | |
@@bh_window = NArray.float( @fft_window_size ) | |
# pi_invw -> 2pi/(N-1) | |
pi_invw = ( 2.0 * PI )/( @fft_window_size.to_f - 1.0 ) | |
a0 = 0.35875 | |
a1 = 0.48829 | |
a2 = 0.14128 | |
a3 = 0.01168 | |
(0...@fft_window_size).each do |i| | |
@@bh_window[i] = a0 - a1 * Math.cos( i.to_f * pi_invw ) + a2 * Math.cos( (2 * i).to_f * pi_invw ) - a3 * Math.cos( (3 * i).to_f * pi_invw ) | |
end | |
end | |
# Creates a square shape of floats where the sum is 1.0, which can then be used to get a weighted central mean of a spectrogram | |
def calc_weighted_mean_rect | |
return @@diamond if @@diamond | |
@@diamond = NArray.float( @window_count, @spectrogram_height ) | |
# This is placeholder code for the simplest central rect, covering very roughly the central 1/3 (by area) of the spectrogram | |
lowest_window = Integer(@window_count * 0.21) | |
highest_window = Integer(@window_count * 0.79) + 1 | |
lowest_freq = Integer(@spectrogram_height * 0.21) | |
highest_freq = Integer(@spectrogram_height * 0.79) + 1 | |
npixels = (highest_window - lowest_window) * (highest_freq - lowest_freq) | |
@@diamond[lowest_window..highest_window,lowest_freq..highest_freq] = 1.0/npixels | |
end | |
def jitter_noise | |
jitter = ( NArray.float( NSAMPLES ).random() - 0.5 ) * ( SAMPLE_SCALE * @sample_jitter ) | |
jitter | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment