def main(exp_path, json_path):
    """Combine all per-speaker ``freqN_*.npy`` vectors into one matrix.

    Args:
        exp_path: Directory holding one sub-directory per key of the JSON
            file; each sub-directory is scanned for ``freqN_<split>.npy``.
        json_path: JSON file of the form
            ``{"corpusA_speakingstyle": [spk1, spk2, ...], ...}``.

    Writes ``splits_freqs`` (one row per freqN file) and ``splits_labels``
    (``<split><style>`` per row) into ``exp_path``, each as ``.npy`` and
    ``.tsv``.
    """
    # load json ...
    # dict with {"corpusA_speakingstyle": [spk1 spk2,...],
    #            "corpusB_speakingstyle": [spk1 spk2,...], ...}
    with open(json_path, 'r') as f:
        corpora = json.load(f)

    prefix, suffix = 'freqN_', '.npy'

    # Scan every corpus directory exactly once. Sorting makes the row order
    # reproducible (os.listdir returns entries in arbitrary order).
    entries = []
    for corpus in corpora:
        style = corpus.split("_")[1]  # second entry is always style
        freqN_path = os.path.join(exp_path, corpus)
        for fname in sorted(os.listdir(freqN_path)):
            if fname.startswith(prefix) and fname.endswith(suffix):
                # Keep the complete split/speaker name: splitting on '_'
                # would cut "spk_01" down to "spk" and the label would no
                # longer match the speaker listed in the JSON file.
                split = fname[len(prefix):-len(suffix)]
                entries.append((os.path.join(freqN_path, fname), f'{split}{style}'))
    print('counted {} freqN-files'.format(len(entries)))

    print('... start combining freqN_*.npy-files coming from path {}'.format(exp_path))
    rows = []
    for path, _ in entries:
        print('read and append split {} ...'.format(path))
        rows.append(np.asarray(np.load(path), dtype=np.float32).reshape(-1))

    if rows:
        # The width is taken from the data instead of being hard-coded.
        freqN_convs = np.stack(rows)
    else:
        # No input files: keep the historical empty shape (0 x 320^2).
        freqN_convs = np.zeros((0, 320**2), dtype=np.float32)
    spks_vec = np.array([label for _, label in entries], dtype=object)

    freq_path = os.path.join(exp_path, 'splits_freqs')
    splits_path = os.path.join(exp_path, 'splits_labels')
    print('\nwrite {}.npy and {}.npy'.format(freq_path, splits_path))
    np.save(freq_path, freqN_convs)
    np.save(splits_path, spks_vec)
    print('\nwrite {}.tsv and {}.tsv'.format(freq_path, splits_path))
    np.savetxt(freq_path+'.tsv', freqN_convs, delimiter='\t')
    np.savetxt(splits_path+'.tsv', spks_vec, fmt='%s', delimiter='\t')
# Number of joint codebook indices counted per speaker (the 320**2 used
# throughout this pipeline; the shape comments below note G=2 groups).
_NUM_CODES = 320 ** 2


def calc_codebook_indexes(audio_path, freq, N):
    """Quantize one audio file and add its codebook usage to ``freq``.

    Relies on the module globals ``model`` and ``device`` set by ``main``.

    Args:
        audio_path: Path of the audio file to load with torchaudio.
        freq: Dict mapping codebook index -> running count; updated through
            ``count_freq``.
        N: Number of feature vectors counted so far (used for logging only).

    Returns:
        ``(freq, Nwav)`` where ``Nwav`` is the number of feature vectors
        contributed by this file; it is 0 when the file is skipped.
    """
    x, fs = torchaudio.load(audio_path)
    x = x.to(device)  # [1 x Samples]
    n_samples = x.shape[1]

    # Must be bound on every path: the original left it undefined for short
    # files and crashed with UnboundLocalError on the return statement.
    Nwav = 0
    if n_samples > 512:
        # Pure inference, so no autograd graph is needed.
        with torch.no_grad():
            C = model.quantize(x)
        codebook_G2_indices = C[1]  # [1 x T x G]; G=2
        codebook_all_indexes = model.quantizer.to_codebook_index(codebook_G2_indices)[0]  # [T]
        Nwav = len(codebook_all_indexes)
        freq, seq = count_freq(codebook_all_indexes, freq)
        print('feature vectors: {}/{} (file/all)'.format(Nwav, N + Nwav))
    else:
        # Report the sample count (len(x) would be the channel count).
        print('WARNING: Input size of file is {} (smaller than Kernel size), skip ...'.format(n_samples))
    return freq, Nwav


def _process_corpus(exp_path, corpus, spks, files_by_speaker):
    """Count and store the codebook frequencies of every speaker of one corpus."""
    style = corpus.split("_")[1]  # second entry is always style
    txt_dir = os.path.join(exp_path, 'txt', corpus)
    npy_dir = os.path.join(exp_path, 'numpy', corpus)
    # os.makedirs instead of os.system('mkdir -p ...'): no shell involved,
    # so paths with spaces or shell metacharacters are handled correctly.
    os.makedirs(txt_dir, exist_ok=True)
    os.makedirs(npy_dir, exist_ok=True)

    # prepare numpy arrays
    freqN_spks = np.zeros((len(spks), _NUM_CODES), dtype=np.float32)  # [SPKS x 102400]
    spks_vec = np.zeros((len(spks),), dtype=object)  # [SPKS x 1]

    for idx, spk in enumerate(spks):
        print(f"\n--- speaker {spk} in corpus {corpus} ---")
        # Keys start at 0: the joint index is built from per-group indices
        # and can be 0, which raised KeyError with the former 1-based keys
        # and shifted every count by one column.
        # NOTE(review): confirm the 0-based range against the installed
        # fairseq `to_codebook_index`.
        freq, N = dict.fromkeys(range(_NUM_CODES), 0), 0
        for audio_path in files_by_speaker.get((corpus, spk), []):
            print('\nread wav-file {}'.format(audio_path))
            freq, Nwav = calc_codebook_indexes(audio_path, freq, N)
            N = N + Nwav
        print(f'(DONE) Found {N} observations for speaker {spk} ...')
        if N == 0:
            # Formerly a bare ZeroDivisionError a few lines further down.
            raise ValueError(
                'no usable audio found for speaker {} in corpus {}; '
                'cannot normalise its codebook frequencies'.format(spk, corpus))

        # combine frequencies
        counts = np.array([freq[code] for code in range(_NUM_CODES)], dtype=np.float64)
        freqN_vec = (counts / N).astype(np.float32).reshape(1, -1)
        with open(os.path.join(txt_dir, 'freq_{}.txt'.format(spk)), 'w') as ffreq, \
             open(os.path.join(txt_dir, 'freqN_{}.txt'.format(spk)), 'w') as ffreqN:
            for code_entry, count in freq.items():
                ffreq.write('{}\t{}\n'.format(code_entry, count))
                ffreqN.write('{}\t{}\n'.format(code_entry, count / N))
        np.save(os.path.join(npy_dir, 'freqN_{}'.format(spk)), freqN_vec)

        # combine frequencies per corpus
        print(f"... speaker {spk} is column {idx} of array freq_{corpus}.npy!")
        freqN_spks[idx, :] = freqN_vec
        spks_vec[idx] = f'{spk}{style}'

    # write combined frequencies
    np.save(os.path.join(npy_dir, 'freq_{}'.format(corpus)), freqN_spks)
    np.save(os.path.join(npy_dir, 'spkIDs_{}'.format(corpus)), spks_vec)


def main(exp_path, lst_path, json_path, model_path, VERBOSE):
    """Extract normalised codebook-usage frequencies for every speaker.

    Args:
        exp_path: Experiment directory; results go to ``txt/<corpus>`` and
            ``numpy/<corpus>``, logs to ``logs``.
        lst_path: List file with one ``<uttID> <audio_path>`` pair per line,
            paths laid out as ``.../data_<corpus>_<style>/<spk>/<file>``.
        json_path: JSON file of the form
            ``{"corpusA_speakingstyle": [spk1, spk2, ...], ...}``.
        model_path: fairseq checkpoint to load.
        VERBOSE: If truthy, stdout of each corpus is written to
            ``logs/codebook_freqs_<corpus>.log``.

    Raises:
        ValueError: If a speaker listed in the JSON has no usable audio.
    """
    import contextlib
    from collections import defaultdict

    # set model and device global
    global model, device
    device = torch.device('cuda')
    # load existing model
    model, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([model_path])
    model = model[0]
    model = model.to(device)
    # Inference only: switch off training-time behaviour such as dropout.
    # NOTE(review): fairseq's Gumbel quantizer appears to sample its codes
    # stochastically in training mode; confirm for the installed version.
    model.eval()

    # load json ...
    # dict with {"corpusA_speakingstyle": [spk1 spk2,...],
    #            "corpusB_speakingstyle": [spk1 spk2,...], ...}
    with open(json_path, 'r') as f:
        corpora = json.load(f)

    # Read the list file once and index it by (corpus, speaker) instead of
    # re-reading and re-parsing it for every speaker.
    files_by_speaker = defaultdict(list)
    with open(lst_path, 'r') as lst:
        for row in lst:
            fields = row.strip().split(None, 1)
            if len(fields) != 2:
                continue  # blank or malformed line
            audio_path = fields[1]
            parts = audio_path.split('/')
            if len(parts) < 3:
                continue
            # .../data_corpus_speakingstyle/spk/*wav, addressed from the
            # end so that the data root may be nested or absolute.
            corpus_match = '_'.join(parts[-3].split('_')[1:])
            files_by_speaker[(corpus_match, parts[-2])].append(audio_path)

    os.makedirs(os.path.join(exp_path, 'logs'), exist_ok=True)
    # corpus loop
    for corpus, spks in corpora.items():
        with contextlib.ExitStack() as stack:
            if VERBOSE:
                # LOGFILE: redirected only for this corpus; the file is
                # closed and stdout restored when the block exits.
                log_path = os.path.join(exp_path, 'logs', "codebook_freqs_{}.log".format(corpus))
                log = stack.enter_context(open(log_path, "w"))
                stack.enter_context(contextlib.redirect_stdout(log))
            _process_corpus(exp_path, corpus, spks, files_by_speaker)
def main(X_path, y_path, pcaA_path, pcab_path, out_path, json_path):
    """Plot the similarity matrix and its 3-D PCA projection.

    Args:
        X_path: ``.npy`` similarity matrix (one row per speaker/split).
        y_path: ``.npy`` object array with the row labels of ``X``.
        pcaA_path: ``.npy`` PCA projection matrix ``A``.
        pcab_path: ``.npy`` PCA offset vector ``b``.
        out_path: Directory that receives the ``.png`` / ``.eps`` figures.
        json_path: JSON file of the form
            ``{"corpusA_speakingstyle": [spk1, spk2, ...], ...}``; its order
            is assumed to match the row order of ``X``.

    Raises:
        ValueError: If there are more corpora than predefined colors, or if
            the number of speakers in the JSON differs from the rows of ``X``.
    """
    # load json ...
    # dict with {"corpusA_speakingstyle": [spk1 spk2,...],
    #            "corpusB_speakingstyle": [spk1 spk2,...], ...}
    with open(json_path, 'r') as f:
        corpora = json.load(f)

    X = np.load(X_path)
    spks = np.load(y_path, allow_pickle=True)

    A = np.load(pcaA_path)
    b = np.load(pcab_path)
    print('\nspkIDs:\n{}'.format(spks))
    print('\ninput matrix:\n{}'.format(X))

    # File stem shared by all output names (computed once).
    stem = os.path.basename(X_path).replace('.npy', '')

    # --- heat map of the similarity matrix ---
    labels = list(spks)
    fig, ax = plt.subplots(figsize=(15, 15))
    cax = ax.matshow(X, interpolation='nearest')
    ax.grid()
    plt.xticks(range(len(labels)), labels, rotation=90)
    plt.yticks(range(len(labels)), labels)
    fig.colorbar(cax, ticks=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, .8, .9, 1])
    fig_path = os.path.join(out_path, '{}.png'.format(stem))
    print('\nsave fig {}'.format(fig_path))
    fig.savefig(fig_path)
    plt.close(fig)  # release the figure, more are created below

    # --- one color per corpus, one entry per speaker ---
    if len(corpora) > len(colors):
        # explicit check: an assert would vanish under `python -O`
        raise ValueError('Not enough colors for corpora')
    colordict = {corpus: [colors[idx]] * len(corpora[corpus])
                 for idx, corpus in enumerate(corpora)}
    print(colordict)
    col = [c for lst in colordict.values() for c in lst]
    # Corpus name of every point, parallel to `col`. Used directly as the
    # legend label so the legend stays correct even if a corpus is empty.
    point_corpus = [corpus for corpus in corpora for _ in corpora[corpus]]
    if len(col) != len(spks):
        raise ValueError(
            'json lists {} speakers but the label vector has {} entries'.format(
                len(col), len(spks)))

    print("\nProject with PCA (3 dimensions)...")
    X_proj = np.dot(X, A) + b
    print(f"\nshape(X_proj) = {np.shape(X_proj)}")
    print('mean(X_proj,0): {}'.format(np.mean(X_proj, 0)))

    # --- 3D plot ---
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(X_proj[:, 0], X_proj[:, 1], X_proj[:, 2], c=col)
    # Set the labels and title
    ax.set_xlabel('PCA1')
    ax.set_ylabel('PCA2')
    ax.set_zlabel('PCA3')
    # Create the legend from empty proxy scatters (one per non-empty corpus)
    for k, v in colordict.items():
        if not v:
            continue  # corpus without speakers: nothing plotted for it
        ax.scatter([], [], [], c=v[0], label=k, alpha=1, s=MS)
    # Shrink current axis to make room for the legend
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.85, box.height])
    # Put a legend to the right of the current axis
    ax.legend(loc='center left', bbox_to_anchor=(1.175, .5))
    # Save the plot
    fig_path = os.path.join(out_path, 'scatter3D_{}.png'.format(stem))
    print('\nsave fig {}'.format(fig_path))
    fig.savefig(fig_path)
    fig_path = os.path.join(out_path, 'scatter3D_{}.eps'.format(stem))
    print('save fig {}'.format(fig_path))
    fig.savefig(fig_path, format='eps')
    plt.close(fig)

    # --- 2D projections of every pair of PCA axes ---
    plt.rcParams.update({'font.size': 50})
    for proj in [(0, 1), (0, 2), (1, 2)]:
        x, y = X_proj[:, proj[0]], X_proj[:, proj[1]]
        fig, ax = plt.subplots(figsize=(18, 16))

        for i, txt in enumerate(spks):
            print(f"speaker {txt}: x = {x[i]}, y = {y[i]}")
            ax.plot(x[i], y[i], 'o', color=col[i], markersize=MS, alpha=.5,
                    label=point_corpus[i])

        # One legend entry per corpus (duplicate labels collapse in the dict).
        handles, labels = ax.get_legend_handles_labels()
        by_label = dict(zip(labels, handles))
        ax.legend(by_label.values(), by_label.keys(), fontsize=30)
        plt.xlabel(f'PCA{str(proj[0]+1)}')
        plt.ylabel(f'PCA{str(proj[1]+1)}')
        plt.grid()
        plt.tight_layout()
        proj_tag = ''.join([str(proj[0]), str(proj[1])])
        fig_path = os.path.join(out_path, 'scatter_proj{}_{}.png'.format(proj_tag, stem))
        print('\nsave fig {}'.format(fig_path))
        fig.savefig(fig_path)
        fig_path = os.path.join(out_path, 'scatter_proj{}_{}.eps'.format(proj_tag, stem))
        print('save fig {}'.format(fig_path))
        fig.savefig(fig_path, format='eps')
        plt.close(fig)
def get_speaker_ids_and_lst_lines(corpus_dir):
    """Collect the speakers of one corpus and the list lines of their audio.

    Args:
        corpus_dir: Directory containing one sub-directory per speaker.

    Returns:
        ``(spk_ids, lst_lines)``: the speaker directory names, and one
        ``"<uttID> <audio_path>\\n"`` line per ``.wav`` / ``.flac`` file.
        Both are sorted by name so that repeated runs produce identical
        files (``os.listdir`` returns entries in arbitrary order).
    """
    spk_ids = []
    lst_lines = []
    for spk in sorted(os.listdir(corpus_dir)):
        spk_dir = os.path.join(corpus_dir, spk)
        if not os.path.isdir(spk_dir):
            continue  # stray files next to the speaker directories
        spk_ids.append(spk)
        for audio_file in sorted(os.listdir(spk_dir)):
            if audio_file.endswith(('.wav', '.flac')):
                # Strip only the final extension; str.replace would also
                # remove '.wav' / '.flac' occurring inside the name.
                uttID = os.path.splitext(audio_file)[0]
                audio_path = os.path.join(spk_dir, audio_file)
                lst_lines.append(f"{uttID} {audio_path}\n")
    return spk_ids, lst_lines
def JSD(P, Q):
    """Return ``1 - JSD(P || Q)``, a Jensen-Shannon based similarity.

    Both inputs are L1-normalised first. ``scipy.stats.entropy`` uses the
    natural logarithm, so the divergence lies in ``[0, ln 2]`` and the
    returned similarity in ``[1 - ln 2, 1]`` (1 for identical
    distributions).
    """
    _P = P / norm(P, ord=1)
    _Q = Q / norm(Q, ord=1)
    _M = 0.5 * (_P + _Q)
    return 1 - (0.5 * (entropy(_P, _M) + entropy(_Q, _M)))


def JSD_similarity(X):
    """Return the pairwise JSD similarity matrix of the rows of ``X``.

    Args:
        X: 2-D array with one (unnormalised) distribution per row.

    Returns:
        ``float32`` array of shape ``[rows x rows]``.
    """
    n = np.shape(X)[0]
    sim = np.zeros((n, n), dtype=np.float32)
    print('\ncalculate JSD similarity matrix of X: {} ...'.format(X.shape))
    # JSD(P, Q) == JSD(Q, P), so only the upper triangle is evaluated and
    # mirrored; this halves the work without changing any value.
    for row in range(n):
        for col in range(row, n):
            sim[row, col] = sim[col, row] = JSD(X[row, :], X[col, :])
    return sim
+ # dict with {"corpusA_speakingstyle": [spk1 spk2,...], + # "corpusB_speakingstyle": [spk1 spk2,...], ...} + with open(json_path, 'r') as f: + corpora = json.load(f) + + # LOAD MATRIX AND SPLITS + X = np.load(X_path) + splits = np.load(y_path, allow_pickle=True) + print('\ninput splits: {}\n... len: {}'.format(splits, len(splits))) + corpora_lengths = {key: len(value) for key, value in corpora.items()} + + # SORT: + d_splits, col = {}, [] + for idx, split in enumerate(splits): + d_splits[split] = X[idx,:] + X = np.zeros((np.shape(X)), dtype=np.float32) + splits = np.zeros((np.shape(splits)), dtype=object) + i = 0 + for corpus in corpora: + for split in corpora[corpus]: + style = corpus.split("_")[1] # second entry is always style + split = f'{split}{style}' + if split in d_splits: + X[i,:] = d_splits[split] + splits[i] = split + i = i + 1 + else: print('Wrong corpora entry: {}?'.format(split)) + print('\nsorted splits: {}\n... len: {}'.format(splits, len(splits))) + + print('\nwrite sorted {} and {}'.format(X_path, y_path)) + np.save(X_path, X) + np.save(y_path, splits) + print('write sorted {} and {}'.format(X_path, y_path)) + np.savetxt(X_path.replace('.npy','.tsv'), X, delimiter='\t') + np.savetxt(y_path.replace('.npy','.tsv'), splits, fmt='%s', delimiter='\t') + + # Calculate similarity matrix and save: + X_sim = JSD_similarity(X) + np.save(os.path.join(out_path,'similarity_matrix'), X_sim) + np.savetxt(os.path.join(out_path,'similarity_matrix.tsv'), X_sim, delimiter='\t') + +if __name__ == "__main__": + ############################# FEATURES PATH ########################### + try: + X_path = sys.argv[1] + print("input matrix path is: " + X_path) + except: + print("ERROR: X_path not specified") + ############################# LABELS PATH ########################### + try: + y_path = sys.argv[2] + print("label vector path is: " + y_path) + except: + print("ERROR: y_path not specified") + ############################# OUT PATH ########################### + try: 
#!/bin/bash
#set -x

# Author: Julian Linke (linke@tugraz.at)
# SPSC TU Graz (July 2023)
#
# Usage: ./run.sh <runDATA> <stage>
#   runDATA  data directory; also names the experiment folder exp_<runDATA>
#   stage    0 = delete the old experiment and run all stages
#            1 = prepare data, 2 = codebook frequencies,
#            3 = similarity matrix, 4 = PCA + plots

set -e -o pipefail
. path.sh
. conda.sh

# Both positional arguments are required (the old check only caught the
# case of no arguments at all and then failed later on an empty $stage).
if [[ $# -lt 2 ]]; then
    echo 'ERROR: this run-script requires two arguments: <runDATA> <stage>' >&2
    exit 1
fi

## runDATA/STAGE
runDATA=$1
stage=$2
VERBOSE=1 # write logs for codebook frequency extraction?

if ! [[ "$stage" =~ ^[0-9]+$ ]]; then
    echo "ERROR: stage must be a non-negative integer, got '$stage'" >&2
    exit 1
fi

printf "\n### STAGE ###\n"
printf "stage: %d\n" "$stage"
printf "### STAGE ###\n"

## DIRS/PATHS
model_path=model/xlsr_53_56k_new.pt
exp_dir="exp_$runDATA"

## STAGE 0: DELETE AND RUN ALL STAGES
if [ "$stage" -eq 0 ]; then
    printf "\n... Delete old experiment and run all ...\n"
    # quoted so that an unusual runDATA can never expand to several paths
    rm -rf -- "${exp_dir}"
fi

## print:
printf "\nCWD: %s" "$CWD"
printf "\nFAIRSEQ: %s" "$FAIRSEQ"
printf "\nrunDATA: %s" "$runDATA"
printf "\nmodel_path: %s" "$model_path"
printf "\nexp_dir: %s\n\n" "$exp_dir"

## CREATE EXPERIMENT FOLDER
mkdir -p "$exp_dir"
mkdir -p "$exp_dir/logs"
mkdir -p "$exp_dir/data"
mkdir -p "$exp_dir/plots"
mkdir -p "$exp_dir/txt"
mkdir -p "$exp_dir/numpy"
mkdir -p "$exp_dir/numpy/pca"

## PREPARE DATA
if [ "$stage" -eq 1 ] || [ "$stage" -eq 0 ]; then
    printf "\n... Prepare data (*lst and *json) ...\n"
    python3 local/prepare_data.py --output_lst_path "$exp_dir/data/${runDATA}.lst" \
                                  --output_json_path "$exp_dir/data/${runDATA}.json" \
                                  --DATA_dir "${runDATA}"
fi

## CODEBOOK FREQUENCIES
if [ "$stage" -eq 2 ] || [ "$stage" -eq 0 ]; then
    printf "\n... Count frequencies of codebooks ...\n"
    python3 local/codebook_freqs.py "$exp_dir" \
                                    "$exp_dir/data/${runDATA}.lst" \
                                    "$exp_dir/data/${runDATA}.json" \
                                    "$model_path" \
                                    "$VERBOSE"
    printf "\n... Combine Arrays ... \n"
    python3 local/codebook_combine_arrays.py "$exp_dir/numpy" \
                                             "$exp_dir/data/${runDATA}.json"
fi

## SIMILARITY MATRIX
if [ "$stage" -eq 3 ] || [ "$stage" -eq 0 ]; then
    printf "\n... Similarity Matrix ...\n"
    python3 local/similarity_matrix.py \
        "$exp_dir/numpy/splits_freqs.npy" \
        "$exp_dir/numpy/splits_labels.npy" \
        "$exp_dir/numpy" \
        "$exp_dir/data/${runDATA}.json"
fi

## PCA AND PLOTS
if [ "$stage" -eq 4 ] || [ "$stage" -eq 0 ]; then
    printf "\n... PCA of similarity matrix ...\n"
    python3 "$FAIRSEQ/examples/wav2vec/unsupervised/scripts/pca.py" \
        "$exp_dir/numpy/similarity_matrix.npy" \
        --output "$exp_dir/numpy/pca" \
        --dim 3
    printf "\n... PLOT ...\n"
    python3 local/plot_pca_similarities.py \
        "$exp_dir/numpy/similarity_matrix.npy" \
        "$exp_dir/numpy/splits_labels.npy" \
        "$exp_dir/numpy/pca/3_pca_A.npy" \
        "$exp_dir/numpy/pca/3_pca_b.npy" \
        "$exp_dir/plots" \
        "$exp_dir/data/${runDATA}.json"
fi