Source code for scripts.reorganize

import csv
import os
import shutil
from src.utils.parallel import parallel_process
from multiprocessing import cpu_count
from runners.utils import load_yaml, parse_yaml
from . import cmd, document_parser
import glob
import logging
import argparse
import yaml
import sys

def split_urbansound_by_fold(path_to_file, output_directory, input_directory,
                             make_copy=False,
                             train_folds=(1, 2, 3, 4, 5, 6, 7, 8),
                             val_folds=(9,), test_folds=(10,),
                             path_to_urbansound_csv=None):
    """
    Placeholder for reorganizing the UrbanSound8K dataset by fold.

    This organization function is not implemented yet: calling it always
    raises ``NotImplementedError`` and no files are created, copied or
    symlinked. The signature is kept so that the intended interface (the same
    one used by the other organization functions in this module) is already
    in place.

    The intended behavior is to use metadata/UrbanSound8K.csv to determine
    which fold each file belongs to and to place the file in the
    corresponding train, validation or test folder.

    Args:
        path_to_file (str): Path to the audio file that will be reorganized.
        output_directory (str): Root of the directory the reorganized file
          should be placed in.
        input_directory (str): The root of the directory that the file comes
          from.
        make_copy (bool, optional): Whether to use a symlink or to actually
          copy the file. Defaults to False.
        train_folds (sequence of int, optional): Which folds belong to the
          train set. Defaults to folds 1 through 8.
        val_folds (sequence of int, optional): Which folds belong to the
          validation set. Defaults to fold 9.
        test_folds (sequence of int, optional): Which folds belong to the
          test set. Defaults to fold 10.
        path_to_urbansound_csv (str, optional): Path to
          metadata/UrbanSound8K.csv. Defaults to None.

    Raises:
        NotImplementedError: Always, until the function is implemented.
    """
    # TODO: implement. The previous draft (removed because it sat unreachable
    # behind this raise and referenced an undefined ``data_directory``) read
    # every row of UrbanSound8K.csv and copied
    # audio/fold<fold>/<slice_file_name> into train/, validation/ or test/
    # under a sub-folder named after the row's class, choosing the split from
    # ``train_folds`` / ``val_folds`` (everything else went to test). A real
    # implementation should also reject a missing ``path_to_urbansound_csv``
    # with a ValueError.
    raise NotImplementedError(
        "split_urbansound_by_fold is not implemented yet."
    )
def split_folder_by_file(path_to_file, output_directory, input_directory,
                         org_file, make_copy=False):
    """
    Reorganizes a directory using an organization file.

    The organization file should contain a list of paths, one per line, that
    are relative to the input_directory. If path_to_file is in the
    organization file, then it will be symlinked (or copied) to the same
    relative path in output_directory. Files that are not listed are ignored,
    and an output path that already exists is left untouched.

    For example if the organization file has an entry::

        path/to/my/file/0.wav

    And path_to_file looks like::

        input_directory/path/to/my/file/0.wav

    Then a new file will be created (or symlinked) at::

        output_directory/path/to/my/file/0.wav

    Args:
        path_to_file (str): Path to the audio file that will be reorganized.
        output_directory (str): Root of the directory the file is copied (or
          symlinked) into, keeping its path relative to input_directory.
        input_directory (str): The root of the directory that the file comes
          from. Used to compute the path of the file relative to it. A
          trailing path separator is allowed.
        org_file (str): Path to the file containing all of the file names
          that should be moved.
        make_copy (bool, optional): Whether to use a symlink or to actually
          copy the file. Defaults to False.
    """
    with open(org_file, 'r') as f:
        files = {line.strip() for line in f}

    # relpath (rather than splitting the string on input_directory) gives the
    # right answer whether or not input_directory ends with a separator.
    relative_path = os.path.relpath(path_to_file, input_directory)
    if relative_path not in files:
        return

    output_path = os.path.join(output_directory, relative_path)
    parent_directory = os.path.dirname(output_path)
    if parent_directory:
        os.makedirs(parent_directory, exist_ok=True)

    # lexists also detects a broken symlink left at output_path, which
    # os.symlink would otherwise trip over with FileExistsError.
    if os.path.lexists(output_path):
        return

    if make_copy:
        shutil.copyfile(path_to_file, output_path)
    else:
        # A relative symlink target is resolved relative to the directory
        # holding the link, not the current working directory, so the target
        # has to be absolute for the link to stay valid.
        os.symlink(os.path.abspath(path_to_file), output_path)
def split_folder_by_class(path_to_file, output_directory, input_directory,
                          make_copy=False):
    """Splits a folder by class, which is indicated by the name of the file.

    The mixture name is the name of the parent directory of the file. This
    function is used to organize datasets like musdb for consumption by
    Scaper for mixing new datasets. Takes a folder with audio file structure
    that looks like this::

        folder_input/
            mixture_one_name/
                vocals.wav
                bass.wav
                drums.wav
                other.wav
            mixture_two_name/
                vocals.wav
                bass.wav
                drums.wav
                other.wav
            ...

    and reorganizes it to a different folder like so::

        folder_output/
            vocals/
                mixture_one_name.wav
                mixture_two_name.wav
                ...
            bass/
                mixture_one_name.wav
                mixture_two_name.wav
                ...
            drums/
                mixture_one_name.wav
                mixture_two_name.wav
                ...
            other/
                mixture_one_name.wav
                mixture_two_name.wav
                ...

    so that it can be processed easily by Scaper. Notably, MUSDB has this
    folder structure. By default this reorganization is done via symlinks so
    that the entire dataset is not copied. An output path that already
    exists is left untouched.

    Args:
        path_to_file (str): Path to the audio file that will be reorganized.
          Has form /path/to/mixture_name/source_name.ext
        output_directory (str): Where the file after swapping the
          mixture_name and source_name will be copied (or symlinked) to.
        input_directory (str): The root of the directory that the file comes
          from. Not used by this function; accepted so that every
          organization function can be called with the same arguments.
        make_copy (bool): Whether to use a symlink or to actually copy the
          file. Defaults to False.
    """
    mixture_directory, file_name = os.path.split(path_to_file)
    class_name, ext = os.path.splitext(file_name)
    mixture_name = os.path.basename(mixture_directory)

    class_directory = os.path.join(output_directory, class_name)
    output_path = os.path.join(class_directory, mixture_name + ext)
    os.makedirs(class_directory, exist_ok=True)

    # lexists also detects a broken symlink left at output_path, which
    # os.symlink would otherwise trip over with FileExistsError.
    if os.path.lexists(output_path):
        return

    if make_copy:
        shutil.copyfile(path_to_file, output_path)
    else:
        # A relative symlink target is resolved relative to the directory
        # holding the link, not the current working directory, so the target
        # has to be absolute for the link to stay valid.
        os.symlink(os.path.abspath(path_to_file), output_path)
def reorganize(input_path, output_path, org_func, make_copy=False,
               audio_extensions=('.wav', '.mp3', '.aac'), **kwargs):
    """
    Reorganizes the folders in the input path into the output path given an
    organization function, passed in by org_func.

    Audio files are searched for recursively, so files at any depth beneath
    input_path are found. The organization function is then called once per
    file, in parallel, with the keyword arguments ``path_to_file``,
    ``output_directory``, ``input_directory``, ``make_copy`` and anything
    given in kwargs.

    Args:
        input_path (str): Root of folder where all audio files will be
          reorganized.
        output_path (str): Root of folder where the reorganized files will be
          placed.
        org_func (str): Organization function to use to reorganize the
          dataset. Should correspond to the name of a function in
          reorganize.py.
        make_copy (bool): Whether to use a symlink or to actually copy the
          file. Defaults to False.
        audio_extensions (sequence of str, optional): Audio extensions to
          look for in the input_path. Matching files will be reorganized and
          placed into the output directory. Defaults to
          ('.wav', '.mp3', '.aac').
        kwargs (dict): Additional keyword arguments that are passed to the
          org_func that is specified.

    Raises:
        AttributeError: If org_func does not name anything defined in this
          module.
    """
    paths_to_files = []
    for ext in audio_extensions:
        # Without recursive=True, '**' behaves like a plain '*' and only
        # files exactly one directory below input_path would be matched.
        paths_to_files += glob.glob(
            f'{input_path}/**/*{ext}', recursive=True)

    args = [{
        'path_to_file': p,
        'output_directory': output_path,
        'input_directory': input_path,
        'make_copy': make_copy,
        **kwargs
    } for p in paths_to_files]

    # Resolve the organization function by name from this module.
    module = sys.modules[__name__]
    organize = getattr(module, org_func)

    parallel_process(
        args,
        organize,
        n_jobs=cpu_count(),
        front_num=1,
        use_kwargs=True
    )
@document_parser('reorganize', 'scripts.reorganize.reorganize')
def build_parser():
    # Builds the command line parser for this script. The options mirror the
    # arguments of `reorganize`. The specification is kept as a table of
    # (flag, add_argument options) pairs, registered in order below.
    argument_specs = (
        ('--input_path', {
            'type': str,
            'help': "Root of folder where all audio files will be "
                    "reorganized.",
        }),
        ('--output_path', {
            'type': str,
            'help': "Root of folder where all reorganized files will be "
                    "placed.",
        }),
        ('--org_func', {
            'type': str,
            'help': "Organization function to use reorganize the dataset. "
                    "Should correspond to the name of a function in "
                    "reorganize.py.",
        }),
        ('--make_copy', {
            'action': "store_true",
            'help': "Whether to use a symlink or to actually copy the file.",
        }),
        ('--audio_extensions', {
            'nargs': '+',
            'help': "Audio extensions to look for in the input_path. "
                    "Matching ones will be reorganize and placed into the "
                    "output directory via a symlink.",
            'default': ['.wav', '.mp3', '.aac'],
        }),
    )

    parser = argparse.ArgumentParser()
    for flag, options in argument_specs:
        parser.add_argument(flag, **options)
    return parser
# Script entry point: hand the `reorganize` function and the parser factory
# to the package's `cmd` helper.
# NOTE(review): presumably `cmd` parses the command line with build_parser()
# and calls reorganize with the result - confirm against its definition.
if __name__ == "__main__":
    cmd(reorganize, build_parser)