Source code for nussl.separation.ft2d

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np
import copy
from scipy.ndimage.filters import maximum_filter, minimum_filter, uniform_filter

from ..core.audio_signal import AudioSignal
from ..core import constants
from . import mask_separation_base
from . import masks


class FT2D(mask_separation_base.MaskSeparationBase):
    """Implements foreground/background separation using the 2D Fourier Transform.

    The 2D Fourier transform (2DFT) of each channel's magnitude spectrogram is
    computed, local peaks of its magnitude are picked out, and the peak /
    non-peak parts are inverted back to the spectrogram domain to build a
    background mask and a foreground mask.

    Parameters:
        input_audio_signal: (AudioSignal object) The AudioSignal object that has the
            audio data that the algorithm will be run on.
        high_pass_cutoff: (Optional) (float) value (in Hz) for the high pass cutoff
            filter.
        neighborhood_size: (Optional) (tuple) Window size handed to the
            maximum/minimum/uniform filters used for peak picking on the 2DFT
            magnitude.
        do_mono: (Optional) (bool) Flattens the AudioSignal held by this object to
            mono before running the algorithm.
        use_librosa_stft: (Optional) (bool) Calls librosa's stft function instead of
            nussl's.
        quadrants_to_keep: (Optional) (tuple of int) Quadrant indices (0-3) of the
            filtered 2DFT that are kept; every other quadrant is zeroed
            (see :func:`filter_quadrants`).
        use_background_fourier_transform: (Optional) (bool) If True the mask is
            estimated from the background 2DFT and the foreground mask is its
            complement; if False it is the other way around.
        mask_alpha: (Optional) (float) Exponent applied to the estimated ratio mask.
        mask_type: (Optional) Mask type forwarded to ``MaskSeparationBase``.
        filter_approach: (Optional) (str) Peak picking strategy, either
            ``'original'`` or ``'local_std'``.

    Raises:
        ValueError: If ``filter_approach`` is not one of the allowed values.
    """

    # Peak picking strategies understood by compute_ft2d_mask.
    _ALLOWED_FILTER_APPROACHES = ('original', 'local_std')

    def __init__(self, input_audio_signal, high_pass_cutoff=100.0, neighborhood_size=(1, 25),
                 do_mono=False, use_librosa_stft=constants.USE_LIBROSA_STFT,
                 quadrants_to_keep=(0, 1, 2, 3), use_background_fourier_transform=True,
                 mask_alpha=1.0,
                 mask_type=mask_separation_base.MaskSeparationBase.SOFT_MASK,
                 filter_approach='local_std'):
        super(FT2D, self).__init__(input_audio_signal=input_audio_signal, mask_type=mask_type)
        self.high_pass_cutoff = high_pass_cutoff
        self.background = None
        self.foreground = None
        self.use_librosa_stft = use_librosa_stft
        self.neighborhood_size = neighborhood_size
        self.result_masks = None
        self.quadrants_to_keep = quadrants_to_keep
        self.use_background_fourier_transform = use_background_fourier_transform
        self.mask_alpha = mask_alpha

        # Filled in by _compute_spectrograms() / compute_ft2d_mask().
        self.stft = None
        self.ft2d = None
        self.bg_ft2d = None
        self.fg_ft2d = None

        allowed_filter_approaches = list(self._ALLOWED_FILTER_APPROACHES)
        if filter_approach not in allowed_filter_approaches:
            raise ValueError(f'filter approach must be one of {allowed_filter_approaches}')
        self.filter_approach = filter_approach

        if do_mono:
            self.audio_signal.to_mono(overwrite=True)

    def run(self):
        """Compute the background and foreground masks for every channel.

        Returns:
            result_masks (list): Two masks, ``[background, foreground]``. They are
            ``masks.SoftMask`` objects, converted with ``mask_to_binary()`` when
            ``self.mask_type`` is ``BINARY_MASK``. The same list is stored in
            ``self.result_masks`` (call :func:`make_audio_signals` afterwards to
            get the corresponding audio).
        """
        # High pass filter cutoff freq. (in # of freq. bins), +1 to match MATLAB
        # implementation. Held in a local so that self.high_pass_cutoff stays in Hz
        # and calling run() more than once does not convert the value twice.
        cutoff_bin = int(np.ceil(self.high_pass_cutoff * (self.stft_params.n_fft_bins - 1)
                                 / self.audio_signal.sample_rate)) + 1

        self._compute_spectrograms()

        # separate the mixture background by masking
        background_masks = []
        foreground_masks = []
        for ch in range(self.audio_signal.num_channels):
            background_mask, foreground_mask = self.compute_ft2d_mask(self.ft2d, ch)
            # high-pass filter the foreground: everything below the cutoff is
            # assigned to the background.
            background_mask[0:cutoff_bin, :] = 1
            foreground_mask[0:cutoff_bin, :] = 0
            background_masks.append(background_mask)
            foreground_masks.append(foreground_mask)

        # Stack per-channel masks and move the channel axis last.
        background_masks = np.array(background_masks).transpose((1, 2, 0)).astype('float')
        foreground_masks = np.array(foreground_masks).transpose((1, 2, 0)).astype('float')

        self.result_masks = []
        for mask in (background_masks, foreground_masks):
            mask = masks.SoftMask(mask)
            if self.mask_type == self.BINARY_MASK:
                mask = mask.mask_to_binary()
            self.result_masks.append(mask)

        return self.result_masks

    def _compute_spectrograms(self):
        """Compute the STFT and the 2DFT of each channel's magnitude spectrogram.

        Results are stored in ``self.stft`` and ``self.ft2d`` (channels on the
        last axis).
        """
        self.stft = self.audio_signal.stft(overwrite=True, remove_reflection=True,
                                           use_librosa=self.use_librosa_stft)
        self.ft2d = np.stack(
            [np.fft.fft2(np.abs(self.stft[:, :, i]))
             for i in range(self.audio_signal.num_channels)],
            axis=-1
        )

    def filter_quadrants(self, data):
        """Zero every quadrant of ``data`` that is not in ``self.quadrants_to_keep``.

        Quadrants are numbered by the halves of the two leading axes:

            * 0: ``data[shape[0] // 2:, :shape[1] // 2]``
            * 1: ``data[:shape[0] // 2, :shape[1] // 2]``
            * 2: ``data[:shape[0] // 2, shape[1] // 2:]``
            * 3: ``data[shape[0] // 2:, shape[1] // 2:]``

        Parameters:
            data: (np.ndarray) Array to filter. It is modified in place.

        Returns:
            The same ``data`` array, with the dropped quadrants set to 0.
        """
        half_rows = data.shape[0] // 2
        half_cols = data.shape[1] // 2
        quadrant_regions = (
            (slice(half_rows, None), slice(None, half_cols)),
            (slice(None, half_rows), slice(None, half_cols)),
            (slice(None, half_rows), slice(half_cols, None)),
            (slice(half_rows, None), slice(half_cols, None)),
        )
        for quadrant, region in enumerate(quadrant_regions):
            if quadrant not in self.quadrants_to_keep:
                data[region] = 0
        return data

    def compute_ft2d_mask(self, ft2d, ch):
        """Estimate the background and foreground masks for one channel.

        Parameters:
            ft2d: (np.ndarray) 2DFT of the magnitude spectrograms, channels last.
            ch: (int) Index of the channel to process.

        Returns:
            (bg_mask, fg_mask): The two masks for channel ``ch``; they sum to 1.

        Raises:
            ValueError: If ``self.filter_approach`` is not an allowed value.
        """
        channel_ft2d = ft2d[:, :, ch]
        if self.filter_approach == 'original':
            bg_ft2d, fg_ft2d = self.filter_local_maxima(channel_ft2d)
        elif self.filter_approach == 'local_std':
            bg_ft2d, fg_ft2d = self.filter_local_maxima_with_std(channel_ft2d)
        else:
            # Only reachable if the attribute was changed after construction.
            allowed_filter_approaches = list(self._ALLOWED_FILTER_APPROACHES)
            raise ValueError(f'filter approach must be one of {allowed_filter_approaches}')

        self.bg_ft2d = self.filter_quadrants(bg_ft2d)
        self.fg_ft2d = self.filter_quadrants(fg_ft2d)

        # Small offset keeps the division below away from zero.
        _stft = np.abs(self.stft[:, :, ch]) + 1e-7

        if self.use_background_fourier_transform:
            ft2d_used = self.bg_ft2d
        else:
            ft2d_used = self.fg_ft2d

        # The estimate may never exceed the mixture magnitude.
        est_stft = np.minimum(np.abs(np.fft.ifft2(ft2d_used)), _stft)
        est_mask = (est_stft / _stft) ** self.mask_alpha
        est_mask /= (est_mask + 1e-7).max()

        if self.use_background_fourier_transform:
            bg_mask = est_mask
            fg_mask = 1 - bg_mask
        else:
            fg_mask = est_mask
            bg_mask = 1 - fg_mask

        return bg_mask, fg_mask

    @staticmethod
    def _symmetrize(maxima):
        """Element-wise maximum of ``maxima`` and its left-right / up-down flips.

        ``np.maximum`` is a binary ufunc whose third positional argument is the
        output buffer, so the three arrays have to be combined with two calls.
        """
        return np.maximum(np.maximum(maxima, np.fliplr(maxima)), np.flipud(maxima))

    def filter_local_maxima_with_std(self, ft2d):
        """Split a 2DFT into background/foreground with a soft, std-normalized peak mask.

        Every local maximum of the (shifted, normalized) 2DFT magnitude is weighted
        by how far it sits above the local mean, measured in local standard
        deviations. The weights are scaled to at most 1 and used as the background
        mask on ``ft2d``; the foreground gets the complement.

        Parameters:
            ft2d: (np.ndarray) 2DFT of a single channel.

        Returns:
            (background_ft2d, foreground_ft2d)
        """
        data = np.abs(np.fft.fftshift(ft2d))
        data /= (np.max(data) + 1e-7)

        data_max = maximum_filter(data, self.neighborhood_size)
        data_mean = uniform_filter(data, self.neighborhood_size)
        data_mean_sq = uniform_filter(data ** 2, self.neighborhood_size)
        # E[x^2] - E[x]^2 can come out slightly negative through rounding; clamp it
        # so the square root cannot produce NaN and poison the whole mask.
        data_var = np.maximum(data_mean_sq - data_mean ** 2, 0)
        data_std = np.sqrt(data_var) + 1e-7

        maxima = ((data_max - data_mean) / data_std)
        is_local_max = (data == data_max)
        maxima *= is_local_max
        maxima = maxima.astype(float)
        maxima /= (np.max(maxima) + 1e-7)

        maxima = self._symmetrize(maxima)
        maxima = np.fft.ifftshift(maxima)

        background_ft2d = np.multiply(maxima, ft2d)
        foreground_ft2d = np.multiply(1 - maxima, ft2d)

        return background_ft2d, foreground_ft2d

    def filter_local_maxima(self, ft2d):
        """Split a 2DFT into background/foreground with a binary peak mask.

        A bin of the (shifted, normalized) 2DFT magnitude is a peak when it equals
        the local maximum and the local max-min range exceeds the global standard
        deviation of the magnitude. Peaks form the background, the rest the
        foreground.

        Parameters:
            ft2d: (np.ndarray) 2DFT of a single channel.

        Returns:
            (background_ft2d, foreground_ft2d)
        """
        data = np.abs(np.fft.fftshift(ft2d))
        data /= (np.max(data) + 1e-7)
        threshold = np.std(data)

        data_max = maximum_filter(data, self.neighborhood_size)
        data_min = minimum_filter(data, self.neighborhood_size)

        maxima = (data == data_max)
        has_contrast = ((data_max - data_min) > threshold)
        maxima[~has_contrast] = False

        maxima = self._symmetrize(maxima)
        maxima = np.fft.ifftshift(maxima)

        background_ft2d = np.multiply(maxima, ft2d)
        foreground_ft2d = np.multiply(1 - maxima, ft2d)

        return background_ft2d, foreground_ft2d

    def make_audio_signals(self):
        """Returns the background and foreground audio signals.

        You must have run :func:`run` prior to calling this function. This function
        will return None if :func:`run` has not been called.

        Returns:
            Audio Signals (List): 2 element list.

                * bkgd: Audio signal with the calculated background track
                * fkgd: Audio signal with the calculated foreground track
        """
        if self.result_masks is None:
            return None

        sources = []
        for mask in self.result_masks:
            source = self.audio_signal.apply_mask(mask)
            source.istft(
                overwrite=True,
                truncate_to_length=self.audio_signal.signal_length
            )
            sources.append(source)

        return sources